|
--- |
|
license: apache-2.0 |
|
language: |
|
- en |
|
metrics: |
|
- accuracy |
|
library_name: transformers |
|
pipeline_tag: text-generation |
|
--- |
|
|
|
!pip -q install datasets |
|
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git |
|
!pip install -q bitsandbytes |
|
!pip -q install einops |
|
|
|
import datasets |
|
from datasets import load_dataset |
|
# Fetch the "train" split of the diabolic6045/flanv2_cot_alpeca dataset.
dataset = load_dataset(
    "diabolic6045/flanv2_cot_alpeca",
    split="train",
)
|
|
|
import torch |
|
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig |
|
|
|
# Checkpoint that is loaded (and later fine-tuned).
model_name = "TinyPixel/Llama-2-7B-bf16-sharded"

# Quantization settings: 4-bit NF4 weights with a float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the model with the quantization config above.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    quantization_config=bnb_config,
)
# The generation cache is switched off on the model config.
model.config.use_cache = False

# Matching tokenizer; the end-of-sequence token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
|
|
|
from datasets import Dataset

# Minimum number of tokenizer tokens an "output" must have to be kept.
min_response_tokens = 100


def _has_long_response(example):
    """Return True when the example's "output" has at least ``min_response_tokens`` tokens."""
    return len(tokenizer.tokenize(example["output"])) >= min_response_tokens


# Dataset.filter keeps every column and still returns a valid (empty) dataset
# when no example passes. The previous hand-built list indexed element 0 and
# raised IndexError in that case.
filtered_dataset = dataset.filter(_has_long_response)
dataset = filtered_dataset
|
|
|
|
|
from collections import defaultdict |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
|
|
# Collect the "output" texts and tokenize each with the model tokenizer.
responses = [example["output"] for example in dataset]
tokenized_responses = [tokenizer.tokenize(response) for response in responses]

# TF-IDF vectors over the space-joined tokens, then all-pairs cosine similarity.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(
    [" ".join(tokens) for tokens in tokenized_responses]
)
cos_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Responses whose similarity exceeds this value count as duplicates.
threshold = 0.95

# Greedy grouping: the first response not yet claimed by an earlier group
# becomes a representative and claims every later, still-unclaimed response
# that is similar to it. Each index ends up in exactly one group, so emitting
# one response per group really removes the near-duplicates. (The earlier
# version created a group for every index and therefore kept one row per
# original response.)
grouped_responses = defaultdict(list)
assigned = set()
for i in range(len(responses)):
    if i in assigned:
        continue
    assigned.add(i)
    grouped_responses[i].append(i)  # the representative is always group[0]
    for j in range(i + 1, len(responses)):
        if j not in assigned and cos_sim_matrix[i, j] > threshold:
            assigned.add(j)
            grouped_responses[i].append(j)

# Keep only the representative of each group.
deduplicated_responses = [
    responses[group[0]] for group in grouped_responses.values()
]

# Create a new dataset from deduplicated responses
deduplicated_dataset = Dataset.from_dict({"input": deduplicated_responses})
|
|
|
from peft import LoraConfig , get_peft_model |
|
|
|
# LoRA hyperparameters.
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

# NOTE: despite its name, `peft_model` holds the LoRA *configuration*; it is
# passed as `peft_config` when the trainer is built.
peft_model = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    # Was misspelled "CASUAL_LM", which is not a PEFT task type.
    task_type="CAUSAL_LM",
)
|
|
|
from transformers import TrainingArguments |
|
|
|
# Directory passed to the trainer as its output location.
output_dir = "./results"

# Batching: 4 sequences per device, gradients accumulated over 2 steps.
per_device_train_batch_size = 4
gradient_accumulation_steps = 2

# Optimizer and learning-rate schedule.
optim = "paged_adamw_8bit"
learning_rate = 2e-4
lr_scheduler_type = "constant"
warmup_ratio = 0.03
max_grad_norm = 1

# Run length plus checkpoint / logging cadence, all in steps.
max_steps = 100
save_steps = 100
logging_steps = 10

# Gather every setting in one mapping and hand it to TrainingArguments.
training_kwargs = {
    "output_dir": output_dir,
    "per_device_train_batch_size": per_device_train_batch_size,
    "gradient_accumulation_steps": gradient_accumulation_steps,
    "optim": optim,
    "save_steps": save_steps,
    "logging_steps": logging_steps,
    "learning_rate": learning_rate,
    "fp16": True,
    "max_grad_norm": max_grad_norm,
    "max_steps": max_steps,
    "warmup_ratio": warmup_ratio,
    "group_by_length": True,
    "lr_scheduler_type": lr_scheduler_type,
}
training_argumet = TrainingArguments(**training_kwargs)
|
|
|
from trl import SFTTrainer |
|
|
|
# Maximum sequence length handed to the trainer.
max_seq_length = 512

# Switch to training mode, then flag every parameter stored in one of the
# listed float/complex dtypes as requiring gradients.
model.train()
grad_dtypes = (torch.float32, torch.float64, torch.complex64, torch.complex128)
for name, param in model.named_parameters():
    if param.dtype not in grad_dtypes:
        continue
    param.requires_grad = True

# Build the supervised fine-tuning trainer with gradient tracking enabled.
# It trains on the "input" text column of the deduplicated dataset and uses
# the LoRA configuration as its PEFT config.
with torch.autograd.enable_grad():
    trainer = SFTTrainer(
        model=model,
        args=training_argumet,
        tokenizer=tokenizer,
        train_dataset=deduplicated_dataset,
        dataset_text_field="input",
        max_seq_length=max_seq_length,
        peft_config=peft_model,
    )
|
|
|
import argparse |
|
import torch |
|
|
|
# Command-line switch that lets the user opt out of CUDA.
parser = argparse.ArgumentParser(description='PyTorch Example')
parser.add_argument('--disable-cuda', action='store_true', help='Disable CUDA')

# parse_known_args returns unrecognised arguments in `unknown` instead of
# exiting with an error.
args, unknown = parser.parse_known_args()

# Use the GPU only when it is both allowed and available; otherwise the CPU.
use_cuda = (not args.disable_cuda) and torch.cuda.is_available()
args.device = torch.device('cuda' if use_cuda else 'cpu')
|
|
|
|
|
# Cast every sub-module whose qualified name contains "norm" to float16.
# Module.to() converts the module in place, so its return value is not needed.
# NOTE(review): float16 for norm layers looks deliberate here - confirm.
for name, module in trainer.model.named_modules():
    if "norm" not in name:
        continue
    module.to(torch.float16)

# Run the fine-tuning loop.
trainer.train()
|
|
|
# If the trainer's model exposes a `.module` attribute (i.e. it is wrapped),
# save the inner model; otherwise save the model itself. Output goes to
# the "output" directory.
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model
model_to_save.save_pretrained("output")

# Read the saved adapter configuration back from disk.
lora_config = LoraConfig.from_pretrained('output')

# Continue with the model that was just trained. The previous
# get_peft_model(model, lora_config) call only attached a freshly initialised
# adapter and never loaded the weights saved in "output", so inference and the
# Hub upload would not have used the fine-tuned weights.
model = model_to_save
# Evaluation mode turns off dropout (including LoRA dropout) for generation.
model.eval()
|
|
|
# Display the "input" column of the dataset (notebook cell output).
dataset["input"]
|
|
|
# Sample prompt for a quick sanity check. Named `prompt` so the builtin
# input() is no longer shadowed.
prompt = "Given the sentence :A gathering of people with a young man playing a guitar. is it true that :A single woman is watching a band of guitar players.?"

# Put the inputs on whichever device the model's parameters live on instead of
# a hard-coded "cuda:0".
device = next(model.parameters()).device

# Tokenize, generate up to 50 new tokens and print the decoded text.
inputs = tokenizer(prompt, return_tensors="pt").to(device)
output = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))
|
|
|
from huggingface_hub import login |
|
# Authenticate with the Hugging Face Hub.
login()

# Upload the model to the Hub under this repository name.
hub_repo_name = "llama2_flan_v2_F.T"
model.push_to_hub(hub_repo_name)
|
|
|
|
|
|