---
license: apache-2.0
language:
- en
metrics:
- accuracy
library_name: transformers
pipeline_tag: text-generation
---
```python
# Install dependencies
!pip -q install datasets
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q bitsandbytes
!pip -q install einops
```
```python
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Load the FLAN-v2 chain-of-thought dataset (Alpaca format)
dataset = load_dataset("diabolic6045/flanv2_cot_alpeca", split="train")

# Load the base model with 4-bit NF4 quantization (QLoRA-style)
model_name = "TinyPixel/Llama-2-7B-bf16-sharded"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
)
model.config.use_cache = False  # caching is only useful at inference time, not during training

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # Llama has no pad token by default
```
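As an aside, `peft` bundles the usual k-bit training fixups (dtype casts, enabling input gradients) into a single helper. The script below performs these steps by hand instead; this is just a sketch of the alternative:

```python
# Alternative (not used in this run): peft's built-in k-bit preparation,
# which covers the manual requires_grad / dtype fixups done further down.
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)
```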
```python
# Keep only examples whose response is at least 100 tokens long
from datasets import Dataset

filtered_dataset = []
for example in dataset:
    response_tokens = tokenizer.tokenize(example["output"])
    if len(response_tokens) >= 100:
        filtered_dataset.append(example)

# Rebuild a datasets.Dataset from the filtered list
dict_of_lists = {key: [example[key] for example in filtered_dataset] for key in filtered_dataset[0]}
dataset = Dataset.from_dict(dict_of_lists)
```
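The same length filter can be written with `datasets`' built-in `filter`, which avoids rebuilding the dataset by hand; a one-line equivalent:

```python
# Equivalent filter using the datasets API (same >= 100-token cutoff)
dataset = dataset.filter(lambda ex: len(tokenizer.tokenize(ex["output"])) >= 100)
```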
```python
# Remove near-duplicate responses using TF-IDF cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

responses = [example["output"] for example in dataset]
tokenized_responses = [tokenizer.tokenize(response) for response in responses]

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([" ".join(tokens) for tokens in tokenized_responses])
cos_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Greedily keep the first response of each near-duplicate cluster
threshold = 0.95
seen = set()
deduplicated_responses = []
for i in range(len(responses)):
    if i in seen:
        continue
    deduplicated_responses.append(responses[i])
    for j in range(i + 1, len(responses)):
        if cos_sim_matrix[i, j] > threshold:
            seen.add(j)

# Build the training dataset; only the responses are kept, under the key "input"
deduplicated_dataset = Dataset.from_dict({"input": deduplicated_responses})
```
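It is worth checking how much data survives each cleaning stage before committing to a training run; a quick, illustrative check:

```python
# Illustrative: compare corpus sizes across the two cleaning stages
print(f"{len(dataset)} examples after the length filter")
print(f"{len(deduplicated_dataset)} examples after near-duplicate removal")
```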
```python
# LoRA configuration
from peft import LoraConfig

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",  # "CASUAL_LM" was a typo; peft expects CAUSAL_LM
)
```
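To see how small the trainable footprint of this configuration is, `peft` can report it directly. Note that `SFTTrainer` below applies `peft_config` itself, so this wrapping is shown for illustration only; skip it in the actual run rather than passing an already-wrapped model to the trainer:

```python
# Illustration only: wrap the model to inspect trainable parameter counts.
from peft import get_peft_model

wrapped = get_peft_model(model, peft_config)
wrapped.print_trainable_parameters()  # prints trainable vs. total parameters
```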
```python
# Training arguments
from transformers import TrainingArguments

output_dir = "./results"
per_device_train_batch_size = 4
gradient_accumulation_steps = 2
optim = "paged_adamw_8bit"
save_steps = 100
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 1.0
max_steps = 100
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)
```
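With `per_device_train_batch_size = 4` and `gradient_accumulation_steps = 2`, each optimizer step sees an effective batch of 4 × 2 = 8 sequences on a single GPU, so `max_steps = 100` covers roughly 800 training sequences.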
```python
# Set up the supervised fine-tuning trainer
from trl import SFTTrainer

max_seq_length = 512

model.train()
# Make the remaining floating-point parameters (e.g. layer norms) trainable;
# the 4-bit quantized weights themselves stay frozen.
for name, param in model.named_parameters():
    if param.dtype in [torch.float32, torch.float64, torch.complex64, torch.complex128]:
        param.requires_grad = True

trainer = SFTTrainer(
    model=model,
    train_dataset=deduplicated_dataset,
    peft_config=peft_config,  # SFTTrainer applies the LoRA config internally
    dataset_text_field="input",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)
```
```python
# Optional CLI flag to force CPU; parse_known_args tolerates notebook arguments
import argparse

parser = argparse.ArgumentParser(description="PyTorch Example")
parser.add_argument("--disable-cuda", action="store_true", help="Disable CUDA")
args, unknown = parser.parse_known_args()
if not args.disable_cuda and torch.cuda.is_available():
    args.device = torch.device("cuda")
else:
    args.device = torch.device("cpu")

# Cast normalization layers to fp16 to match the fp16 compute dtype
# (.to() modifies the module in place)
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float16)

trainer.train()

# Save only the LoRA adapter weights, not the full base model
model_to_save = trainer.model.module if hasattr(trainer.model, "module") else trainer.model
model_to_save.save_pretrained("output")
```
```python
# Reload the trained adapter onto the base model for inference.
# Note: PeftModel.from_pretrained() loads the trained adapter weights;
# get_peft_model() would instead initialize a fresh, untrained adapter.
from peft import PeftModel

model = PeftModel.from_pretrained(model, "output")
model.eval()

prompt = "Given the sentence :A gathering of people with a young man playing a guitar. is it true that :A single woman is watching a band of guitar players.?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
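If a standalone checkpoint is preferred over base-model-plus-adapter, `peft` can fold the LoRA weights into the base weights. Merging into a 4-bit-quantized model is not generally supported, so this sketch assumes a full-precision reload of the base model; the output directory name is illustrative:

```python
# Sketch: merge the LoRA adapter into a full-precision copy of the base model
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
merged = PeftModel.from_pretrained(base, "output").merge_and_unload()
merged.save_pretrained("merged-model")  # hypothetical output directory
```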
```python
# Authenticate and push the adapter to the Hugging Face Hub
from huggingface_hub import login

login()
model.push_to_hub("llama2_flan_v2_F.T")
```
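Once pushed, the adapter can be loaded elsewhere in a single call. The repo id below assumes the adapter lands under the author's `Apk02` namespace; adjust it to the actual account:

```python
# Sketch: load the pushed adapter for inference (repo id is an assumption)
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

model = AutoPeftModelForCausalLM.from_pretrained("Apk02/llama2_flan_v2_F.T", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("TinyPixel/Llama-2-7B-bf16-sharded")
```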