File size: 9,255 Bytes
235f923
 
3e53afe
235f923
 
3e53afe
235f923
 
 
 
 
 
 
 
 
3e53afe
 
 
 
 
 
235f923
 
 
 
 
3e53afe
 
235f923
 
3e53afe
 
 
 
235f923
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3e53afe
235f923
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3e53afe
 
 
 
235f923
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3e53afe
235f923
3e53afe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235f923
 
 
 
 
 
 
 
 
 
3e53afe
235f923
 
 
f762e1b
cf47d83
f762e1b
359ae4f
 
2c1b805
69e3a41
3cb87ba
359ae4f
bf1271a
3759c5d
6f08d4d
136885b
85ec4d4
5e5f699
2c1b805
a71097f
93e26c1
6f08d4d
70ed6f0
2c1b805
af9e54e
f762e1b
a3bc7ec
f762e1b
2c1b805
763be08
f762e1b
 
a3bc7ec
f762e1b
 
 
2a1126f
ea3c34e
f762e1b
c2b4dad
f762e1b
105c4c8
2c1b805
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d37a28d
2c1b805
 
 
 
 
763be08
2c1b805
 
 
f762e1b
2c1b805
f762e1b
2c1b805
2b5a681
d37a28d
 
2b5a681
f762e1b
 
 
 
 
 
 
235f923
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import pandas as pd
import streamlit as st
from langchain_huggingface import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from huggingface_hub import login
import torch
import json
from datetime import datetime

# Authenticate against the Hugging Face Hub. The access token is read from the
# Streamlit secret named "FIREWORKS".
huggingface_token = st.secrets["FIREWORKS"]
login(huggingface_token)

# 8-bit weight quantization through bitsandbytes, to shrink the model in memory.
# BitsAndBytesConfig is built with its constructor: it has no
# `from_model_type` factory and no GPTQ scheme, so the previous call raised
# AttributeError before any model could be loaded.
quant_config = BitsAndBytesConfig(load_in_8bit=True)

model_id = "fireworks-ai/firefunction-v2"


@st.cache_resource
def _load_tokenizer_and_model(repo_id):
    """Load the tokenizer and the quantized causal LM for ``repo_id``.

    Streamlit re-executes the whole script on every widget interaction, so the
    load is wrapped in ``st.cache_resource``: the checkpoint is loaded once and
    the same objects are reused by later reruns.

    Args:
        repo_id: Hugging Face Hub model identifier.

    Returns:
        A ``(tokenizer, model)`` tuple. The tokenizer is guaranteed to have a
        padding token id.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(repo_id)
    loaded_model = AutoModelForCausalLM.from_pretrained(
        repo_id,
        device_map="auto",
        torch_dtype=torch.float16,
        quantization_config=quant_config,
    )

    # Padded batches need a padding token; fall back to the end-of-sequence
    # token when the tokenizer does not define one.
    if loaded_tokenizer.pad_token_id is None:
        loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id

    return loaded_tokenizer, loaded_model


tokenizer, model = _load_tokenizer_and_model(model_id)

# Function specification (JSON-schema style) written for Fireworks function
# calling: a single function taking two required string arguments.
_similarity_properties = {
    "query": {
        "type": "string",
        "description": "The main query string for similarity calculation",
    },
    "job_title": {
        "type": "string",
        "description": "The job title to compare with the query",
    },
}
_similarity_parameters = {
    "type": "object",
    "properties": _similarity_properties,
    "required": ["query", "job_title"],
}
_similarity_function = {
    "name": "calculate_cosine_similarity",
    "description": "Calculate the cosine similarity between two strings.",
    "parameters": _similarity_parameters,
}
function_spec = [_similarity_function]

# The same specification serialized as 4-space-indented JSON text.
functions = json.dumps(function_spec, indent=4)

# Text-generation pipeline around the already loaded model and tokenizer.
# Generation is capped at 50 new tokens to keep each call short.
_generation_settings = {
    "model": model,
    "tokenizer": tokenizer,
    "max_new_tokens": 50,
}
fireworks_pipeline = pipeline("text-generation", **_generation_settings)

# Wrap the pipeline so it can be used as a LangChain LLM.
llm_pipeline = HuggingFacePipeline(pipeline=fireworks_pipeline)

# Streamlit user interface.
st.title("Cosine Similarity Calculation with Fireworks, LangChain, and Llama 3.1")

# CSV upload widget; the file is expected to contain a 'job_title' column.
uploaded_file = st.file_uploader("Sube un archivo CSV con la columna 'job_title':", type=["csv"])

if uploaded_file is not None:
    # Load the uploaded CSV into a DataFrame.
    df = pd.read_csv(uploaded_file)
    if 'job_title' in df.columns:
        query = 'aspiring human resources specialist'
        # Empty cells are read as NaN (a float) and numeric-looking cells as
        # numbers, but the tokenizer only accepts strings: normalize every
        # value so a single blank row cannot abort the whole run.
        job_titles = ["" if pd.isna(title) else str(title) for title in df['job_title']]

        # Process the titles in batches; tune the size to the GPU memory.
        batch_size = 16
        job_titles_batches = [job_titles[i:i+batch_size] for i in range(0, len(job_titles), batch_size)]

        # Prompt describing the similarity task.
        prompt_template = PromptTemplate(
            template=(
                "Calculate the cosine similarity between the query: '{query}' "
                "and the list of job titles: {job_titles}. "
                "Return the results as 'Job Title: [Job Title], Score: [Cosine Similarity Score]'."
            ),
            input_variables=["query", "job_titles"]
        )

        # LangChain chain binding the prompt to the wrapped pipeline.
        # NOTE(review): neither `prompt_template` nor `llm_chain` is used by
        # the batched generation below, which tokenizes the bare job titles.
        llm_chain = LLMChain(
            llm=llm_pipeline,
            prompt=prompt_template
        )

        # Run the generation when the user clicks the button.
        if st.button("Calcular Similitud de Coseno"):
            with st.spinner("Calculando similitudes con Fireworks..."):
                all_scores = []
                try:
                    # Decoder-only models continue from the last position, so
                    # padded batches must be padded on the left.
                    tokenizer.padding_side = "left"
                    # `torch.cuda.amp.autocast()` is deprecated; use the
                    # device-generic context manager and switch it off when no
                    # CUDA device is present.
                    use_cuda_autocast = torch.cuda.is_available()

                    for batch in job_titles_batches:
                        # Tokenize the batch. The tokenizer already returns the
                        # matching attention mask; rebuilding it from
                        # `input_ids != pad_token_id` would also mask genuine
                        # EOS tokens, because the pad token may be the EOS token.
                        model_inputs = tokenizer(
                            batch,
                            return_tensors="pt",
                            padding=True,
                            truncation=True
                        ).to(model.device)

                        with torch.autocast(device_type="cuda", enabled=use_cuda_autocast):
                            generated_ids = model.generate(
                                **model_inputs,
                                max_new_tokens=50,
                                num_beams=1  # greedy decoding, no beam search
                            )

                        # Decode the generated sequences.
                        # NOTE(review): `decoded` is not parsed yet; the scores
                        # below are fixed placeholders for demonstration only.
                        decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
                        all_scores.extend([0.95] * len(batch))

                    # Attach one score per row to the DataFrame.
                    df['Score'] = all_scores

                    # Show the updated DataFrame.
                    st.write("DataFrame con los puntajes de similitud:")
                    st.write(df)
                except Exception as e:
                    # Top-level UI boundary: report any failure on the page.
                    st.error(f"Error durante la generación: {e}")
    else:
        st.error("La columna 'job_title' no se encuentra en el archivo CSV.")



# ---------------------------------------------------------------------------
# Previous version of this app (Llama 3.2 1B driven through an LLMChain), kept
# for reference only.
#
# It is stored as `#` comments rather than as a bare triple-quoted string: a
# string at the end of the file is not a docstring, and Streamlit "magic"
# writes bare literals found on their own line to the page.
# ---------------------------------------------------------------------------
#
# import pandas as pd
# import streamlit as st
# from langchain.llms import HuggingFacePipeline
# from langchain_core.prompts import PromptTemplate
# from langchain.chains import LLMChain
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# from huggingface_hub import login
# import torch
#
# # Hugging Face API key
# huggingface_token = st.secrets["FIREWORKS"]
# login(huggingface_token)  # Authenticate
# #login(api_key)
#
#
# # Configure the Llama model
# model_id = "meta-llama/Llama-3.2-1B"
# tokenizer = AutoTokenizer.from_pretrained(model_id, truncation=True)
# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16)
#
# # Create the text-generation pipeline
# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=50) #, max_length=1024)
# llm_pipeline = HuggingFacePipeline(pipeline=pipe)
#
# # Streamlit interface
# st.title("Cosine Similarity Calculation with Fireworks, LangChain, and Llama 3.1")
#
# # Upload the CSV file
# uploaded_file = st.file_uploader("Sube un archivo CSV con la columna 'job_title':", type=["csv"])
#
# if uploaded_file is not None:
#     # Load the CSV into a DataFrame
#     df = pd.read_csv(uploaded_file)
#     print(df)
#
#     if 'job_title' in df.columns:
#         query = 'aspiring human resources specialist'
#         job_titles = df['job_title'].tolist()
#
#         # Prompt asking the model for cosine-similarity scores
#         prompt_template = PromptTemplate(
#         template=(
#             "You are an AI model with access to external embeddings services. Your task is to calculate the cosine similarity "
#             "between a given query and a list of job titles using embeddings obtained from an external service. "
#             "Follow these steps to complete the task:\n\n"
#             "1. Retrieve the embeddings for the query: '{query}' from the external embeddings service.\n"
#             "2. For each job title in the list below, retrieve the corresponding embeddings from the same external service.\n"
#             "3. Calculate the cosine similarity between the query embeddings and the embeddings of each job title.\n"
#             "4. Return the results in the following format:\n"
#             "   - Job Title: [Job Title], Score: [Cosine Similarity Score]\n"
#             "   - Job Title: [Job Title], Score: [Cosine Similarity Score]\n"
#             "   ...\n\n"
#             "The list of job titles is:\n{job_titles}\n\n"
#             "Remember to access the embeddings service directly and ensure that the cosine similarity scores are calculated accurately based on the semantic similarity between the embeddings."
#         ),
#     input_variables=["query", "job_titles"]
# )
#
#         # Create the LLMChain that handles the interaction with the model
#         llm_chain = LLMChain(
#             llm=llm_pipeline,
#             prompt=prompt_template
#         )
#
#         # Run the generation with the LLM
#         if st.button("Calcular Similitud de Coseno"):
#             with st.spinner("Calculando similitudes con Fireworks y Llama 3.1..."):
#                 try:
#                     result = llm_chain.run({"query": query, "job_titles": job_titles})
#                     st.write("Respuesta del modelo:")
#                     st.write(result)
#
#                     # Placeholder scores in the 'Score' column (demonstration only)
#                     df['Score'] = [0.95] * len(df)
#
#                     # Show the updated DataFrame
#                     st.write("DataFrame con los puntajes de similitud:")
#                     st.write(df)
#                 except Exception as e:
#                     st.error(f"Error durante la generación: {e}")
#     else:
#         st.error("La columna 'job_title' no se encuentra en el archivo CSV.")