Spaces:
Running
Running
import json
import re
from datetime import datetime

import pandas as pd
import streamlit as st
import torch
from huggingface_hub import login
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
# Authenticate against the Hugging Face Hub. The access token is read from the
# Streamlit secret stored under the "FIREWORKS" key.
huggingface_token = st.secrets["FIREWORKS"]
login(huggingface_token)

# 8-bit bitsandbytes quantization to shrink the memory footprint of the model.
# BitsAndBytesConfig has no `from_model_type` constructor, and GPTQ is a
# separate quantization scheme that is not configured through bitsandbytes.
quant_config = BitsAndBytesConfig(load_in_8bit=True)

model_id = "fireworks-ai/firefunction-v2"


@st.cache_resource
def _load_tokenizer_and_model(repo_id, _quantization):
    """Load the tokenizer and the quantized causal LM for ``repo_id``.

    Streamlit re-executes the whole script on every widget interaction, so the
    load is wrapped in ``st.cache_resource`` to run once per server process
    instead of once per rerun.

    Args:
        repo_id: Hugging Face Hub repository id of the model.
        _quantization: Quantization config passed to ``from_pretrained``. The
            leading underscore tells Streamlit not to hash this argument.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(repo_id)
    loaded_model = AutoModelForCausalLM.from_pretrained(
        repo_id,
        device_map="auto",
        torch_dtype=torch.float16,
        quantization_config=_quantization,
    )
    # Batched tokenization needs a padding token; fall back to EOS when the
    # tokenizer does not define one.
    if loaded_tokenizer.pad_token_id is None:
        loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
    return loaded_tokenizer, loaded_model


tokenizer, model = _load_tokenizer_and_model(model_id, quant_config)
# Function-calling specification handed to the Fireworks model: one function
# that takes a query string and a job title string.
_similarity_properties = {
    "query": {
        "type": "string",
        "description": "The main query string for similarity calculation",
    },
    "job_title": {
        "type": "string",
        "description": "The job title to compare with the query",
    },
}

function_spec = [
    dict(
        name="calculate_cosine_similarity",
        description="Calculate the cosine similarity between two strings.",
        parameters=dict(
            type="object",
            properties=_similarity_properties,
            # Every declared property is mandatory.
            required=list(_similarity_properties),
        ),
    ),
]

# JSON rendering of the specification (4-space indentation).
functions = json.dumps(function_spec, indent=4)
# Text-generation pipeline around the loaded model and tokenizer. The output
# is capped at 50 new tokens to keep generation fast.
_pipeline_options = {
    "model": model,
    "tokenizer": tokenizer,
    "max_new_tokens": 50,
}
fireworks_pipeline = pipeline("text-generation", **_pipeline_options)

# Expose the transformers pipeline through LangChain's LLM interface.
llm_pipeline = HuggingFacePipeline(pipeline=fireworks_pipeline)
# Matches the "Score: <number>" fragment that the prompt asks the model to
# emit; an optional "[" is tolerated because the prompt shows the placeholder
# in square brackets.
_SCORE_PATTERN = re.compile(r"Score:\s*\[?\s*(-?\d+(?:\.\d+)?)")


def _extract_score(generated_text):
    """Return the first ``Score: <number>`` value in ``generated_text``.

    Returns NaN when the text contains no parseable score, so a malformed
    completion shows up as a missing value instead of a made-up number.
    """
    found = _SCORE_PATTERN.search(generated_text)
    return float(found.group(1)) if found else float("nan")


def _score_prompts(prompts):
    """Generate one completion per prompt and return the parsed scores.

    Args:
        prompts: List of fully formatted prompt strings (one per job title).

    Returns:
        A list of floats, in the same order as ``prompts``; NaN where the
        completion contained no score.
    """
    # The tokenizer already returns an attention mask that matches its
    # padding, so it is used as-is. Rebuilding it from pad_token_id would also
    # mask real EOS tokens whenever the pad token falls back to EOS.
    model_inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(model.device)
    # Mixed precision only when CUDA is present; torch.cuda.amp.autocast() is
    # deprecated and emits a warning on CPU-only hosts.
    with torch.autocast(device_type="cuda", enabled=torch.cuda.is_available()):
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=50,
            num_beams=1,  # greedy decoding, no beam search
            pad_token_id=tokenizer.pad_token_id,
        )
    # With left padding every prompt occupies the same leading columns, so the
    # newly generated tokens start right after the padded prompt length.
    prompt_length = model_inputs["input_ids"].shape[1]
    completions = tokenizer.batch_decode(
        generated_ids[:, prompt_length:],
        skip_special_tokens=True,
    )
    return [_extract_score(text) for text in completions]


# Streamlit interface
st.title("Cosine Similarity Calculation with Fireworks, LangChain, and Llama 3.1")

# CSV upload
uploaded_file = st.file_uploader("Sube un archivo CSV con la columna 'job_title':", type=["csv"])

if uploaded_file is not None:
    # Load the CSV into a DataFrame
    df = pd.read_csv(uploaded_file)
    if 'job_title' in df.columns:
        query = 'aspiring human resources specialist'
        # Missing titles become empty strings so the tokenizer only ever
        # receives str values.
        job_titles = df['job_title'].fillna("").astype(str).tolist()

        # Process in batches; tune the size to the available GPU memory.
        batch_size = 16
        job_titles_batches = [
            job_titles[start:start + batch_size]
            for start in range(0, len(job_titles), batch_size)
        ]

        # Prompt sent to the model for every job title.
        prompt_template = PromptTemplate(
            template=(
                "Calculate the cosine similarity between the query: '{query}' "
                "and the list of job titles: {job_titles}. "
                "Return the results as 'Job Title: [Job Title], Score: [Cosine Similarity Score]'."
            ),
            input_variables=["query", "job_titles"]
        )

        if st.button("Calcular Similitud de Coseno"):
            with st.spinner("Calculando similitudes con Fireworks..."):
                try:
                    # Decoder-only models must be padded on the left for
                    # batched generation; otherwise the continuation would be
                    # generated after padding tokens.
                    tokenizer.padding_side = "left"
                    all_scores = []
                    for batch in job_titles_batches:
                        # One prompt per title, so each completion carries
                        # exactly one score and stays aligned with its row.
                        prompts = [
                            prompt_template.format(query=query, job_titles=[title])
                            for title in batch
                        ]
                        all_scores.extend(_score_prompts(prompts))
                except Exception as e:
                    # UI boundary: report the failure in the page instead of
                    # crashing the script.
                    st.error(f"Error durante la generación: {e}")
                else:
                    # NOTE: the score is whatever number the model wrote in
                    # its answer; it is not computed from embeddings.
                    df['Score'] = all_scores
                    st.write("DataFrame con los puntajes de similitud:")
                    st.write(df)
    else:
        st.error("La columna 'job_title' no se encuentra en el archivo CSV.")
# NOTE: an earlier variant of this script used to be kept here inside a bare
# triple-quoted string. With Streamlit's default "magic" enabled, a string
# literal on its own line is written to the page, so that dead code was at risk
# of being rendered in the app. It has been removed; recover it from version
# control if needed. For reference, that variant:
#   - loaded "meta-llama/Llama-3.2-1B" in float16 without quantization,
#   - sent a single prompt containing every job title through LLMChain.run,
#   - displayed the raw model response and filled the 'Score' column with a
#     placeholder value of 0.95.