# stakelovelace
# 2
# 3b6b2b0
# raw
# history blame
# 3.51 kB
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
import csv
import yaml
from datasets import Dataset
import tensorflow as tf
# Diagnostic only: print the GPU devices TensorFlow can see. In the code
# visible here TensorFlow is not used for anything else; training and
# generation below go through PyTorch / Transformers.
print("GPUs Available: ", tf.config.list_physical_devices('GPU'))
import os
# Set PyTorch's MPS allocator high-watermark ratio to '0.0'.
# NOTE(review): per the PyTorch MPS environment-variable docs, 0.0 disables
# the allocator's upper memory limit -- confirm this is intended, since it can
# let the process exhaust system memory on Apple-silicon machines.
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'
def load_data_and_config(data_path):
    """Load training examples from a semicolon-delimited CSV file.

    Each row's ``description`` column becomes one ``{'text': ...}`` example.
    Despite the function name, only data is loaded; no configuration is read.

    Args:
        data_path: Path to a CSV file with a header row that contains a
            ``description`` column. Fields are separated by ``;``.

    Returns:
        A list of ``{'text': str}`` dicts in file order, one per row whose
        description is present and not blank.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If a data row is read but the header has no
            ``description`` column.
    """
    examples = []
    # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order mark.
    # With plain 'utf-8' the mark stays attached to the first header name, so
    # a BOM-prefixed file whose first column is 'description' raises KeyError.
    with open(data_path, newline='', encoding='utf-8-sig') as csvfile:
        reader = csv.DictReader(csvfile, delimiter=';')
        for row in reader:
            description = row['description']
            # DictReader fills columns missing from a short row with None;
            # neither None nor a blank cell is usable training text.
            if description is None or not description.strip():
                continue
            examples.append({'text': description})
    return examples
def generate_api_query(model, tokenizer, prompt, desired_output, api_name, base_url):
    """Generate an API query with a fine-tuned model and append it to ``base_url``.

    The instruction sent to the model is ``prompt`` followed by
    ``" Write an API query to {api_name} to get {desired_output}"``.

    Args:
        model: A generative model exposing ``device``, ``config`` and
            ``generate()`` (e.g. one loaded with ``AutoModelForCausalLM``).
        tokenizer: The tokenizer matching ``model``.
        prompt: Free-text description of what the caller wants to do.
        desired_output: What the generated query should retrieve.
        api_name: Name of the target API, inserted into the instruction.
        base_url: Base URL the generated query is appended to.

    Returns:
        ``base_url`` and the generated text joined by a single ``/``.
    """
    instruction = prompt + f" Write an API query to {api_name} to get {desired_output}"
    input_ids = tokenizer.encode(instruction, return_tensors="pt")
    # Inputs must live on the same device as the model's weights.
    input_ids = input_ids.to(model.device)
    # Sampling is enabled so that the temperature setting takes effect.
    # max_length counts the prompt tokens as well as the generated ones.
    output = model.generate(input_ids, max_length=256, temperature=0.7, do_sample=True)
    generated = output[0]
    # A decoder-only model returns prompt + continuation. Drop the prompt
    # tokens so the instruction text does not end up inside the URL.
    # Encoder-decoder models return only the continuation, so leave it as is.
    if not getattr(model.config, "is_encoder_decoder", False):
        generated = generated[input_ids.shape[-1]:]
    query = tokenizer.decode(generated, skip_special_tokens=True).strip()
    # Avoid a double slash when base_url already ends with '/'.
    return f"{base_url.rstrip('/')}/{query}"
from transformers import TrainingArguments, Trainer
def train_model(model, tokenizer, data):
    """Fine-tune ``model`` on ``data`` with the Hugging Face Trainer API.

    Every example is tokenized to a fixed length of 512 tokens (truncated or
    padded) and trained with a language-modeling objective in which the
    labels are the input ids themselves. The trained model is written to
    ``./results`` and logs to ``./logs``.

    Args:
        model: The model to fine-tune; it is updated in place.
        tokenizer: The tokenizer matching ``model``. It must define a padding
            token because examples are padded to ``max_length``.
        data: Iterable of dicts with a ``'text'`` key holding the raw string.

    Returns:
        None.
    """
    # Tokenize to plain Python lists; Dataset.from_dict takes them directly,
    # so there is no need to build tensors and squeeze them again.
    encodings = [
        tokenizer(example['text'], max_length=512, truncation=True, padding='max_length')
        for example in data
    ]
    input_ids = [enc['input_ids'] for enc in encodings]
    attention_mask = [enc['attention_mask'] for enc in encodings]
    # Labels mirror the inputs, except that padding positions are set to
    # -100, the index the loss ignores. Otherwise the model would mostly be
    # trained to predict padding on short examples.
    labels = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(input_ids, attention_mask)
    ]
    dataset = Dataset.from_dict({
        'input_ids': input_ids,
        # Lets the model ignore padding positions in attention as well.
        'attention_mask': attention_mask,
        'labels': labels,
    })

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer,
    )

    # The Trainer runs the whole optimization loop internally.
    trainer.train()

    # Release cached accelerator memory. is_available() is the runtime check;
    # is_built() only says the binary was compiled with MPS support.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif torch.backends.mps.is_available():
        torch.mps.empty_cache()

    # Persist the trained model to training_args.output_dir.
    trainer.save_model()
def main(api_name, base_url, data_path="train2.csv",
         model_name="thenlper/gte-small", output_dir="./fine_tuned_model"):
    """Fine-tune a model on the CSV data, save it, and print a sample query.

    Args:
        api_name: Name of the API the sample query should target.
        base_url: Base URL the generated query is appended to.
        data_path: CSV file with the training data.
        model_name: Checkpoint name or path used for both the tokenizer and
            the model.
        output_dir: Directory the fine-tuned model and tokenizer are saved to.

    Returns:
        None. The generated sample is printed to stdout.
    """
    data = load_data_and_config(data_path)

    # NOTE(review): the default checkpoint name suggests an embedding encoder
    # rather than a generative language model -- confirm it is the intended
    # base for text generation.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    train_model(model, tokenizer, data)

    # Save the model together with its tokenizer so the directory can be
    # reloaded on its own.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Training leaves the model in training mode. Switch to eval mode so
    # dropout is disabled while generating the example below.
    model.eval()

    prompt = "I need to retrieve the latest block on chain using a python script"
    api_query = generate_api_query(model, tokenizer, prompt, "latest block on chain", api_name, base_url)
    print(f"Generated code: {api_query}")
if __name__ == "__main__":
    # Script entry point: run the full pipeline against the Koios REST API.
    api_name, base_url = "Koios", "https://api.koios.rest"
    main(api_name, base_url)