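# Fine-tunes a small causal language model to turn natural-language prompts
# into API queries for the Koios REST API (https://api.koios.rest).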
import os
# Disable the MPS high-watermark memory cap; set before torch touches the MPS backend
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

import csv

import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer

# Report available accelerators with torch (the rest of the script is PyTorch-only)
print("CUDA available:", torch.cuda.is_available())
print("MPS available:", torch.backends.mps.is_available())
def load_training_data(data_path):
    """Loads training examples from a semicolon-delimited CSV file."""
    data = []
    with open(data_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile, delimiter=';')  # ensure the delimiter matches your CSV file
        for row in reader:
            data.append({'text': row['description']})  # the 'description' column holds the training text
    return data
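# A sketch of the expected train2.csv layout (hypothetical rows; the only
# column the loader reads is 'description', and fields are ';'-separated):
#
#   description
#   Fetch the latest block with GET /api/v1/tip
#   Look up an address balance with POST /api/v1/address_info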
def generate_api_query(model, tokenizer, prompt, desired_output, api_name, base_url):
    """Generates an API query using the fine-tuned model."""
    input_ids = tokenizer.encode(
        prompt + f" Write an API query to {api_name} to get {desired_output}",
        return_tensors="pt",
    )
    input_ids = input_ids.to(model.device)  # keep inputs on the same device as the model
    output = model.generate(input_ids, max_length=256, temperature=0.7, do_sample=True)  # sample with temperature control
    # Decode only the newly generated tokens so the prompt is not echoed into the query
    query = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
    return f"{base_url}/{query}"
def train_model(model, tokenizer, data):
    """Trains the model using the Hugging Face Trainer API."""
    # Tokenize each example; padding to max_length keeps tensor shapes uniform
    encodings = [
        tokenizer(d['text'], max_length=512, truncation=True, padding='max_length')
        for d in data
    ]
    # For causal-LM fine-tuning, the labels are the input ids themselves
    dataset = Dataset.from_dict({
        'input_ids': [e['input_ids'] for e in encodings],
        'attention_mask': [e['attention_mask'] for e in encodings],
        'labels': [e['input_ids'] for e in encodings],
    })
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer,
    )
    # The Trainer handles the training loop internally
    trainer.train()
    # Optionally free cached memory on CUDA or Apple-silicon MPS
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif torch.backends.mps.is_available():
        torch.mps.empty_cache()
    trainer.save_model()
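# A leaner variant (a sketch, not the only way to do this): skip the manual
# max-length padding and let DataCollatorForLanguageModeling batch examples
# dynamically; with mlm=False it builds causal-LM labels itself and masks
# pad positions out of the loss:
#
#   from transformers import DataCollatorForLanguageModeling
#
#   collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
#   dataset = Dataset.from_dict({
#       'input_ids': [tokenizer(d['text'], truncation=True, max_length=512)['input_ids']
#                     for d in data],
#   })
#   trainer = Trainer(model=model, args=training_args,
#                     train_dataset=dataset, data_collator=collator)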
def main(api_name, base_url):
    # Load data
    data = load_training_data("train2.csv")
    # Load tokenizer and model. Note: thenlper/gte-small is an embedding model
    # and is not suited to causal-LM loading, so a small generative checkpoint
    # such as distilgpt2 is used here instead (any causal LM works)
    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
    model = AutoModelForCausalLM.from_pretrained("distilgpt2")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # GPT-2-family tokenizers define no pad token
    # Train the model on your dataset
    train_model(model, tokenizer, data)
    # Save the fine-tuned model
    model.save_pretrained("./fine_tuned_model")
    tokenizer.save_pretrained("./fine_tuned_model")
    # Example usage
    prompt = "I need to retrieve the latest block on chain using a python script"
    api_query = generate_api_query(model, tokenizer, prompt, "latest block on chain", api_name, base_url)
    print(f"Generated query: {api_query}")
if __name__ == "__main__":
    api_name = "Koios"
    base_url = "https://api.koios.rest"
    main(api_name, base_url)
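# A minimal sketch of reloading the fine-tuned model later for inference
# (the paths match the save_pretrained calls in main()):
#
#   tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_model")
#   model = AutoModelForCausalLM.from_pretrained("./fine_tuned_model")
#   query = generate_api_query(model, tokenizer,
#                              "I need to retrieve the latest block on chain",
#                              "latest block on chain",
#                              "Koios", "https://api.koios.rest")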