import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import time
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from diffusers import DiffusionPipeline
from huggingface_hub import login, HfApi, Repository
from dotenv import load_dotenv
import gradio as gr
# Load environment variables
load_dotenv()
class UnifiedModel(nn.Module):
    def __init__(self, models):
        super(UnifiedModel, self).__init__()
        self.models = nn.ModuleList(models)
        # The classifier consumes the concatenation of every sub-model's pooled
        # hidden state, so its input width is the sum of the hidden sizes
        self.classifier = nn.Linear(
            sum(model.config.hidden_size for model in models if hasattr(model, "config")), 2
        )

    def forward(self, inputs):
        # `inputs` is a list with one tokenized dict per sub-model, in model order
        hidden_states = []
        for model, model_inputs in zip(self.models, inputs):
            if isinstance(model, DiffusionPipeline):
                # Kept from the original design; note that a DiffusionPipeline is
                # not an nn.Module and cannot actually live inside nn.ModuleList
                outputs = model(**model_inputs)
                hidden_states.append(torch.tensor(outputs).float())
            else:
                # Causal-LM outputs only expose hidden states when requested
                outputs = model(**model_inputs, output_hidden_states=True)
                hidden_states.append(outputs.hidden_states[-1][:, 0, :])
        concatenated_hidden_states = torch.cat(hidden_states, dim=-1)
        logits = self.classifier(concatenated_hidden_states)
        return logits
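# A minimal usage sketch (hypothetical tensors, assuming two sub-models): the
# model takes one tokenized dict per sub-model and returns 2-way logits.
#
#   inputs = [
#       {"input_ids": ids_a, "attention_mask": mask_a},
#       {"input_ids": ids_b, "attention_mask": mask_b},
#   ]
#   logits = UnifiedModel([model_a, model_b])(inputs)  # shape: (batch_size, 2)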
class SyntheticDataset(Dataset):
    def __init__(self, tokenizers, size=100):
        self.tokenizers = tokenizers
        self.size = size
        self.data = self._generate_data()

    def _generate_data(self):
        data = []
        for _ in range(self.size):
            text = "This is a sample sentence for testing purposes."
            label = torch.tensor(0)  # Sample label
            item = {"text": text, "label": label}
            for name, tokenizer in self.tokenizers.items():
                tokenized = tokenizer(text, padding="max_length", truncation=True, max_length=128)
                item[f"input_ids_{name}"] = torch.tensor(tokenized["input_ids"])
                item[f"attention_mask_{name}"] = torch.tensor(tokenized["attention_mask"])
            data.append(item)
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
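# Note: PyTorch's default collate function stacks the per-key tensors, so each
# batch arrives as {"input_ids_<name>": (batch_size, 128), "attention_mask_<name>":
# (batch_size, 128), "label": (batch_size,), "text": list of strings}.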
def push_to_hub(local_dir, repo_name):
    try:
        repo_url = HfApi().create_repo(repo_name, exist_ok=True)
        repo = Repository(local_dir, clone_from=repo_url)
        if not os.path.exists(os.path.join(local_dir, ".git")):
            os.system(f"cd {local_dir} && git init && git remote add origin {repo_url} && git pull origin main")
        repo.git_add(auto_lfs_track=True)
        repo.git_commit("Add model and tokenizer files")
        json_files = [
            "config.json",
            "generation_config.json",
            "special_tokens_map.json",
            "tokenizer.json",
            "tokenizer.model",
            "tokenizer_config.json",
        ]
        for json_file in json_files:
            json_file_path = os.path.join(local_dir, json_file)
            if os.path.exists(json_file_path):
                repo.git_add(json_file_path)
        repo.git_push()
        print(f"Pushed model and tokenizer to {repo_url}")
    except Exception as e:
        print(f"Error pushing to Hugging Face Hub: {e}")
def load_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    return tokenizer, model
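# Note: several checkpoints in the list below are multi-billion-parameter models
# (and some are diffusion/audio models that AutoModelForCausalLM cannot load at
# all), so loading everything in fp32 on CPU is unlikely to fit in memory. A
# hedged sketch of a lighter-weight load:
#
#   model = AutoModelForCausalLM.from_pretrained(
#       model_name, torch_dtype=torch.float16, low_cpu_mem_usage=True
#   )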
def train(model, train_loader, eval_loader, optimizer, tokenizers, args, train_losses, eval_losses):
    model.train()
    epoch = 0
    total_steps = len(train_loader)
    for step, batch in enumerate(train_loader):
        start_time = time.time()
        # Build one tokenized input dict per sub-model, in model order
        inputs = [
            {
                "input_ids": batch[f"input_ids_{name}"].to("cpu"),
                "attention_mask": batch[f"attention_mask_{name}"].to("cpu"),
            }
            for name in tokenizers.keys()
        ]
        labels = batch["label"].to("cpu")
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        # Estimate the remaining time from the duration of the current step
        elapsed_time = time.time() - start_time
        estimated_remaining_time = (total_steps - step - 1) * elapsed_time
        if step % args.logging_steps == 0:
            train_losses.append(loss.item())
            print(f"Step {step}/{total_steps}, Loss: {loss.item()}, Estimated remaining time: {estimated_remaining_time:.2f} seconds")
    epoch += 1
    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in eval_loader:
            inputs = [
                {
                    "input_ids": batch[f"input_ids_{name}"].to("cpu"),
                    "attention_mask": batch[f"attention_mask_{name}"].to("cpu"),
                }
                for name in tokenizers.keys()
            ]
            labels = batch["label"].to("cpu")
            outputs = model(inputs)
            loss = nn.CrossEntropyLoss()(outputs, labels)
            eval_loss += loss.item()
    eval_loss /= len(eval_loader)
    eval_losses.append(eval_loss)
    print(f"Epoch {epoch}/{args.num_train_epochs}, Evaluation Loss: {eval_loss}")
def gradio_interface(input_text):
    # Gradio inference function; relies on the module-level `tokenizers` and
    # `unified_model` that main() sets up before launching the interface
    inputs = [tokenizer(input_text, return_tensors="pt") for tokenizer in tokenizers.values()]
    logits = unified_model(inputs)
    return str(logits.tolist())
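# The interface returns the raw 2-way classifier logits as text; mapping them to
# human-readable labels would need a label set the original code does not define.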
def main():
    # Expose the tokenizers and the unified model at module scope so that
    # gradio_interface (defined above) can reach them
    global tokenizers, unified_model
    while True:
        try:
            os.system("git config --global credential.helper store")
            login(token=os.getenv("HUGGINGFACE_TOKEN"), add_to_git_credential=True)

            # Define the models to use
            models_to_train = [
                "openai-community/gpt2-xl",
                "google/gemma-2-9b-it",
                "google/gemma-2-9b",
                "meta-llama/Meta-Llama-3.1-8B-Instruct",
                "meta-llama/Meta-Llama-3.1-8B",
                "openbmb/MiniCPM-V-2_6",
                "bigcode/starcoder",
                "WizardLMTeam/WizardCoder-Python-34B-V1.0",
                "Qwen/Qwen2-72B-Instruct",
                "google/gemma-2-2b-it",
                "facebook/bart-large-cnn",
                "Falconsai/text_summarization",
                "microsoft/speecht5_tts",
                "Groq/Llama-3-Groq-70B-Tool-Use",
                "Groq/Llama-3-Groq-8B-Tool-Use",
                "facebook/musicgen-large",
                "facebook/musicgen-melody",
                "black-forest-labs/FLUX.1-schnell",
                "facebook/musicgen-small",
                "stabilityai/stable-video-diffusion-img2vid-xt-1-1",
                "openai/whisper-small",
                "black-forest-labs/FLUX.1-dev",
                "stabilityai/stable-diffusion-2-1",
            ]

            # Initialize the models and tokenizers
            tokenizers = {}
            models = []
            for model_name in models_to_train:
                tokenizer, model = load_model(model_name)
                tokenizers[model_name] = tokenizer
                models.append(model)

            # Create a synthetic dataset for training and evaluation
            synthetic_dataset = SyntheticDataset(tokenizers, size=100)

            # Split the dataset into training and evaluation sets
            train_size = int(0.8 * len(synthetic_dataset))
            val_size = len(synthetic_dataset) - train_size
            train_dataset, val_dataset = torch.utils.data.random_split(synthetic_dataset, [train_size, val_size])

            # Create DataLoaders for training and evaluation
            train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
            eval_loader = DataLoader(val_dataset, batch_size=16)

            # Unify the models into a single one
            unified_model = UnifiedModel(models)
            unified_model.to(torch.device("cpu"))

            # Show the total number of parameters to train
            total_params = sum(p.numel() for p in unified_model.parameters())
            print(f"Total parameters to train: {total_params}")

            # Define the training arguments (output_dir is required)
            training_args = TrainingArguments(
                output_dir="./outputs",
                per_device_train_batch_size=2,
                per_device_eval_batch_size=16,
                num_train_epochs=1,
                logging_steps=10,
                save_steps=10,
                evaluation_strategy="steps",
            )

            # Define the optimizer
            optimizer = AdamW(unified_model.parameters(), lr=5e-5)

            # Lists to accumulate the losses
            train_losses = []
            eval_losses = []

            # Train the model
            train(unified_model, train_loader, eval_loader, optimizer, tokenizers, training_args, train_losses, eval_losses)

            # Plot the losses
            fig, ax = plt.subplots()
            ax.set_xlabel("Epochs")
            ax.set_ylabel("Loss")
            ax.plot(train_losses, label="Training Loss")
            ax.plot(eval_losses, label="Evaluation Loss")
            ax.legend()

            def animate(i):
                ax.clear()
                ax.plot(train_losses, label="Training Loss")
                ax.plot(eval_losses, label="Evaluation Loss")
                ax.set_xlabel("Epochs")
                ax.set_ylabel("Loss")
                ax.legend()

            ani = animation.FuncAnimation(fig, animate, interval=1000)
            plt.show()

            # Save the unified model to a local directory
            local_dir = "./outputs/unified_model"
            os.makedirs(local_dir, exist_ok=True)
            torch.save(unified_model.state_dict(), os.path.join(local_dir, "pytorch_model.bin"))

            # Save each tokenizer into its own subdirectory so they do not
            # overwrite one another (model names are sanitized for the filesystem)
            for name, tokenizer in tokenizers.items():
                tokenizer.save_pretrained(os.path.join(local_dir, name.replace("/", "_")))

            # Push the model and tokenizers to Hugging Face
            push_to_hub(local_dir, repo_name="Ffftdtd5dtft/my_model")

            # Configure and launch the Gradio interface
            interface = gr.Interface(fn=gradio_interface, inputs="text", outputs="text")
            interface.launch()
            break
        except Exception as e:
            print(f"Error: {e}")
            time.sleep(2)


if __name__ == "__main__":
    main()