# Captured from a Hugging Face Space page (status shown at capture time: "Build error").
!pip install torch==2.0.1 transformers==4.27.1 datasets==2.4.0 wget==3.2 huggingface-hub==0.14.1 beautifulsoup4==4.11.1 requests==2.28.1 matplotlib tqdm python-dotenv diffusers | |
import os | |
import torch | |
import torch.nn as nn | |
from torch.utils.data import DataLoader, Dataset | |
from torch.optim import AdamW | |
import matplotlib.pyplot as plt | |
import matplotlib.animation as animation | |
import time | |
import threading | |
from tqdm import tqdm | |
from transformers import AutoTokenizer, AutoModel, TrainingArguments, pipeline | |
from diffusers import DiffusionPipeline | |
from huggingface_hub import login, HfApi, Repository | |
from dotenv import load_dotenv | |
# Load environment variables (main() later reads HUGGINGFACE_TOKEN via os.getenv).
load_dotenv()
class UnifiedModel(nn.Module):
    """Classify the concatenated first-token hidden states of several encoders.

    Args:
        models: Iterable of objects to combine. Every ``nn.Module`` in it is
            registered as a trainable encoder and is expected to return an
            object exposing ``last_hidden_state`` with the hidden size given
            by ``model.config.hidden_size``. Anything that is not an
            ``nn.Module`` (for example an inference pipeline) is kept in
            ``self.pipelines`` for reference only.

    Attributes:
        models: ``nn.ModuleList`` of the encoder modules.
        pipelines: Plain list of the non-module objects that were passed in.
        classifier: Linear head mapping the concatenated features to 2 logits.
    """

    def __init__(self, models):
        super().__init__()
        models = list(models)
        # nn.ModuleList raises TypeError for anything that is not an nn.Module,
        # so non-module objects are stored separately instead of registered.
        self.models = nn.ModuleList(
            [model for model in models if isinstance(model, nn.Module)]
        )
        self.pipelines = [
            model for model in models if not isinstance(model, nn.Module)
        ]
        # The head is sized only from encoders that advertise a hidden size.
        feature_dim = sum(
            model.config.hidden_size
            for model in self.models
            if hasattr(model, "config")
        )
        self.classifier = nn.Linear(feature_dim, 2)

    def forward(self, inputs):
        """Return classification logits for ``inputs``.

        Args:
            inputs: Either a single tensor that is fed to every encoder, or a
                list/tuple holding one tensor per encoder, in the same order
                as ``self.models``.

        Returns:
            Tensor of logits produced by ``self.classifier``.

        Raises:
            ValueError: If a list/tuple is given whose length differs from the
                number of encoders.
        """
        if isinstance(inputs, (list, tuple)):
            if len(inputs) != len(self.models):
                raise ValueError(
                    f"Expected {len(self.models)} inputs (one per encoder), "
                    f"got {len(inputs)}"
                )
            per_model_inputs = inputs
        else:
            per_model_inputs = [inputs] * len(self.models)

        # Non-module pipelines are skipped here: the classifier is not sized
        # for them and their outputs are not fixed-width hidden states.
        features = [
            model(model_inputs).last_hidden_state[:, 0, :]
            for model, model_inputs in zip(self.models, per_model_inputs)
        ]
        return self.classifier(torch.cat(features, dim=-1))
class SyntheticDataset(Dataset):
    """In-memory dataset that repeats one fixed sentence with label 0.

    Args:
        tokenizers: Mapping of name -> tokenizer callable. Each tokenizer is
            called as ``tokenizer(text, padding="max_length", truncation=True,
            max_length=128)`` and must return a mapping with ``"input_ids"``
            and ``"attention_mask"`` entries.
        size: Number of samples to generate.

    Each sample is a dict with the keys ``"text"``, ``"label"`` and, per
    tokenizer name, ``"input_ids_<name>"`` and ``"attention_mask_<name>"``.
    """

    def __init__(self, tokenizers, size=100):
        self.tokenizers = tokenizers
        self.size = size
        self.data = self._generate_data()

    def _generate_data(self):
        """Build the list of sample dicts held in ``self.data``."""
        text = "This is a sample sentence for testing purposes."

        # Every sample uses the same sentence, so tokenize it once per
        # tokenizer instead of once per sample.
        encoded = {}
        for name, tokenizer in self.tokenizers.items():
            tokenized = tokenizer(
                text, padding="max_length", truncation=True, max_length=128
            )
            encoded[f"input_ids_{name}"] = torch.tensor(tokenized["input_ids"])
            encoded[f"attention_mask_{name}"] = torch.tensor(
                tokenized["attention_mask"]
            )

        data = []
        for _ in range(self.size):
            item = {"text": text, "label": torch.tensor(0)}  # Sample label
            # Clone so samples do not share tensor storage with each other.
            for key, value in encoded.items():
                item[key] = value.clone()
            data.append(item)
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
def push_to_hub(local_dir, repo_name):
    """Upload the contents of ``local_dir`` to the Hub repository ``repo_name``.

    The repository is created if it does not exist yet. The whole folder is
    uploaded in one commit through the HTTP API, so no local git checkout,
    shell command, or separate staging of individual JSON files is needed.

    This is best-effort: any failure is printed and swallowed, so the function
    always returns ``None``.

    Args:
        local_dir: Path of the folder whose files should be uploaded.
        repo_name: Target repository id, e.g. ``"user/my_model"``.
    """
    try:
        # Fail with a clear message rather than an opaque upload error.
        if not os.path.isdir(local_dir):
            raise FileNotFoundError(f"Local directory not found: {local_dir}")

        api = HfApi()
        repo_url = api.create_repo(repo_name, exist_ok=True)
        api.upload_folder(
            # Prefer the fully qualified id reported by the Hub when available.
            repo_id=getattr(repo_url, "repo_id", repo_name),
            folder_path=local_dir,
            commit_message="Add model and tokenizer files",
        )
        print(f"Pushed model and tokenizer to {repo_url}")
    except Exception as e:
        print(f"Error pushing to Hugging Face Hub: {e}")
def main(max_retries=None):
    """Load the encoders and pipelines, train the unified classifier, plot, and upload.

    Steps, all executed inside one retry loop:
      1. Log in to the Hugging Face Hub with the ``HUGGINGFACE_TOKEN``
         environment variable.
      2. Load a tokenizer and ``AutoModel`` for each name in ``models_to_train``
         and instantiate the inference pipelines.
      3. Build a ``SyntheticDataset``, split it 80/20, and train a
         ``UnifiedModel`` on CPU.
      4. Show an animated loss plot and call ``push_to_hub``.

    Args:
        max_retries: Maximum number of retries after a failed attempt. ``None``
            (the default) retries forever with a 2 second pause between
            attempts. When the limit is exhausted the last exception is
            re-raised.

    NOTE(review): the pip line at the top of the file pins
    ``transformers==4.27.1``, which appears to predate several checkpoints
    listed here (e.g. Gemma 2, Llama 3.1) - confirm the pins.
    NOTE(review): many of the listed checkpoints are very large and everything
    is placed on CPU - confirm the target hardware can hold them.
    """
    attempt = 0
    while True:
        attempt += 1
        try:
            os.system("git config --global credential.helper store")
            login(token=os.getenv("HUGGINGFACE_TOKEN"), add_to_git_credential=True)

            # Models to load as trainable encoders.
            models_to_train = [
                "openai-community/gpt2-xl",
                "google/gemma-2-9b-it",
                "google/gemma-2-9b",
                "meta-llama/Meta-Llama-3.1-8B-Instruct",
                "meta-llama/Meta-Llama-3.1-8B",
                "openbmb/MiniCPM-V-2_6",
                "bigcode/starcoder",
                "WizardLMTeam/WizardCoder-Python-34B-V1.0",
                "Qwen/Qwen2-72B-Instruct",
                "google/gemma-2-2b-it",
                "facebook/bart-large-cnn",
                "Falconsai/text_summarization",
                "microsoft/speecht5_tts",
                "Groq/Llama-3-Groq-70B-Tool-Use",
                "Groq/Llama-3-Groq-8B-Tool-Use",
            ]

            # Initialize the pipelines.
            # NOTE(review): FLUX.1-dev and musicgen-small are listed twice and
            # are therefore loaded twice - confirm this is intended.
            pipelines_to_unify = [
                pipeline("text-to-audio", model="facebook/musicgen-melody"),
                pipeline("text-to-audio", model="facebook/musicgen-large"),
                pipeline("text-to-audio", model="facebook/musicgen-small"),
                DiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt-1-1"),
                pipeline("automatic-speech-recognition", model="openai/whisper-small"),
                DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev"),
                DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1"),
                DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell"),
                pipeline("text-generation", model="meta-llama/Meta-Llama-3.1-8B"),
                pipeline("text-generation", model="openbmb/MiniCPM-V-2_6"),
                pipeline("text-generation", model="bigcode/starcoder"),
                pipeline("text-to-speech", model="microsoft/speecht5_tts"),
                pipeline("text-generation", model="WizardLMTeam/WizardCoder-Python-34B-V1.0"),
                pipeline("text-generation", model="Qwen/Qwen2-72B-Instruct"),
                pipeline("text-generation", model="google/gemma-2-2b-it"),
                pipeline("summarization", model="facebook/bart-large-cnn"),
                pipeline("summarization", model="Falconsai/text_summarization"),
                DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev"),
                pipeline("text-to-audio", model="facebook/musicgen-small"),
                pipeline("text-generation", model="Groq/Llama-3-Groq-70B-Tool-Use"),
                pipeline("text-generation", model="Groq/Llama-3-Groq-8B-Tool-Use"),
            ]

            tokenizers = {}
            models = []
            for model_name in models_to_train:
                tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
                # padding="max_length" in the dataset needs a pad token.
                if tokenizer.pad_token is None:
                    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
                model = AutoModel.from_pretrained(model_name)
                tokenizers[model_name] = tokenizer
                models.append(model)

            # Add the pipelines alongside the models.
            models.extend(pipelines_to_unify)

            # Create a synthetic dataset for training and evaluation.
            synthetic_dataset = SyntheticDataset(tokenizers, size=100)

            # Split the dataset into training and evaluation parts.
            train_size = int(0.8 * len(synthetic_dataset))
            val_size = len(synthetic_dataset) - train_size
            train_dataset, val_dataset = torch.utils.data.random_split(
                synthetic_dataset, [train_size, val_size]
            )

            # Create DataLoaders for training and evaluation.
            train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
            eval_loader = DataLoader(val_dataset, batch_size=16)

            # Unify the models and pipelines into a single model.
            unified_model = UnifiedModel(models)
            unified_model.to(torch.device("cpu"))

            # Report the total number of parameters to train.
            total_params = sum(p.numel() for p in unified_model.parameters())
            print(f"Total parameters to train: {total_params}")

            # Training arguments; used here only as a settings container for
            # the hand-written loop below.
            training_args = TrainingArguments(
                output_dir="outputs/unified_model",
                evaluation_strategy="epoch",
                learning_rate=9e-4,
                per_device_train_batch_size=2,
                per_device_eval_batch_size=16,
                num_train_epochs=1,  # Reduced epochs for quick training
                weight_decay=0.01,
                logging_steps=10,  # More frequent logging for quicker feedback
                optim="adamw_hf",
            )

            # Define the optimizer and the loss (built once, reused every step).
            optimizer = AdamW(unified_model.parameters(), lr=training_args.learning_rate)
            criterion = nn.CrossEntropyLoss()
            train_losses = []
            eval_losses = []

            def collect_input_ids(batch):
                # One tensor per tokenizer, in tokenizer insertion order.
                return [batch[f"input_ids_{name}"].to("cpu") for name in tokenizers]

            def train(model, train_loader, eval_loader, args):
                total_steps = args.num_train_epochs * len(train_loader)
                progress_bar = tqdm(total=total_steps, desc="Training")
                # Time and step count span the whole run so the remaining-time
                # estimate stays meaningful across epochs.
                start_time = time.time()
                global_step = 0
                epoch = 0
                while epoch < args.num_train_epochs:
                    # Evaluation below switches to eval mode; switch back at
                    # the start of every epoch.
                    model.train()
                    for step, batch in enumerate(train_loader):
                        input_ids = collect_input_ids(batch)
                        labels = batch["label"].to("cpu")
                        optimizer.zero_grad()
                        outputs = model(input_ids)
                        loss = criterion(outputs, labels)
                        loss.backward()
                        optimizer.step()
                        progress_bar.update(1)
                        global_step += 1
                        elapsed_time = time.time() - start_time
                        estimated_total_time = total_steps * (elapsed_time / global_step)
                        estimated_remaining_time = estimated_total_time - elapsed_time
                        if step % args.logging_steps == 0:
                            train_losses.append(loss.item())
                            print(f"Step {step}/{total_steps}, Loss: {loss.item()}, Estimated remaining time: {estimated_remaining_time:.2f} seconds")
                    epoch += 1

                    model.eval()
                    eval_loss = 0
                    with torch.no_grad():
                        for batch in eval_loader:
                            input_ids = collect_input_ids(batch)
                            labels = batch["label"].to("cpu")
                            outputs = model(input_ids)
                            loss = criterion(outputs, labels)
                            eval_loss += loss.item()
                    eval_loss /= len(eval_loader)
                    eval_losses.append(eval_loss)
                    print(f"Epoch {epoch}/{args.num_train_epochs}, Evaluation Loss: {eval_loss}")
                progress_bar.close()

            train(unified_model, train_loader, eval_loader, training_args)

            # Visualize the losses recorded during training.
            fig, ax = plt.subplots()

            def animate(i):
                # ax.clear() also wipes the axis labels, so set them again on
                # every frame.
                ax.clear()
                ax.set_xlabel("Epochs")
                ax.set_ylabel("Loss")
                # i + 1 so that the final frame includes the last logged value.
                ax.plot(train_losses[: i + 1], label="Train Loss")
                ax.plot(eval_losses[: i + 1], label="Eval Loss")
                ax.legend()

            # Keep a reference so the animation is not garbage-collected.
            ani = animation.FuncAnimation(fig, animate, frames=len(train_losses), blit=False)
            plt.show()

            # Upload the unified model to the Hugging Face Hub.
            local_dir = "./outputs/unified_model"
            push_to_hub(local_dir, repo_name="Ffftdtd5dtft/my_model")
            break
        except Exception as e:
            print(f"Error: {e}")
            if max_retries is not None and attempt > max_retries:
                raise
            time.sleep(2)
# Run the full pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()