from datasets import load_dataset
# Load 70% of the Wikipedia dataset
# dataset = load_dataset('wikimedia/wikipedia', "20231101.en", split='train[:70%]')
dataset = load_dataset('lucadiliello/wikipedia_512_pretraining', split='train[:70%]')
# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# # Define the quantization configuration for 4-bit
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,                  # Enable 4-bit precision
#     bnb_4bit_quant_type="nf4",          # Use the NF4 quantization type (good for reducing memory)
#     bnb_4bit_use_double_quant=True,     # Enable double quantization to improve accuracy
#     bnb_4bit_compute_dtype="float16"    # Use float16 for faster computation
# )
# # Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
# # Load the model with the quantization configuration
# model = AutoModelForCausalLM.from_pretrained(
#     'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
#     quantization_config=quantization_config,  # Apply the 4-bit quantization config
#     device_map='auto'                         # Automatically map the model to available devices (e.g., GPU/CPU)
# )
# # Enable gradient checkpointing to reduce memory usage during training
# model.gradient_checkpointing_enable()
########################################################### gpt2 ####################################################
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Define the quantization configuration for 4-bit
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                  # Enable 4-bit precision
    bnb_4bit_quant_type="nf4",          # Use the NF4 quantization type (good for reducing memory)
    bnb_4bit_use_double_quant=True,     # Enable double quantization to improve accuracy
    bnb_4bit_compute_dtype="float16"    # Use float16 for faster computation
)
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')
# Load the model with the quantization configuration
model = AutoModelForCausalLM.from_pretrained(
    'gpt2',
    quantization_config=quantization_config,  # Apply the 4-bit quantization config
    device_map='auto'                         # Automatically map the model to available devices (e.g., GPU/CPU)
)
# Enable gradient checkpointing to reduce memory usage during training
model.gradient_checkpointing_enable()
from peft import LoraConfig, get_peft_model
import bitsandbytes as bnb
# Configure PEFT with 4-bit precision
# lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")
lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["attn.c_attn", "mlp.c_fc", "mlp.c_proj"], lora_dropout=0.05, bias="none")
peft_model = get_peft_model(model, lora_config)
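# Optional sanity check: report how many parameters LoRA leaves trainable
# versus the frozen 4-bit base model.
peft_model.print_trainable_parameters()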
# Set the pad token (using eos_token or adding a new special token)
if tokenizer.pad_token is None:
    # Option 1: Use eos_token as pad_token
    tokenizer.pad_token = tokenizer.eos_token
    # Option 2: Add [PAD] as a new pad token if needed
    # tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Tokenize the dataset with optimized settings
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=150)

tokenized_dataset = dataset.select(range(100000)).map(tokenize_function, batched=True)

def prepare_labels(batch):
    batch["labels"] = batch["input_ids"].copy()  # Copy input_ids as labels for causal language modeling
    return batch
# Apply the transformation to add labels
tokenized_dataset = tokenized_dataset.map(prepare_labels, batched=True)
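# The tokenized dataset now carries input_ids, attention_mask and labels, so it could be
# handed to a standard Trainer. The block below is only an illustrative, commented-out
# sketch: the output directory and hyperparameters are placeholder assumptions, not
# values taken from this script.
# from transformers import Trainer, TrainingArguments
# training_args = TrainingArguments(
#     output_dir="gpt2-lora-wikipedia",   # hypothetical output path
#     per_device_train_batch_size=8,      # assumed batch size
#     num_train_epochs=1,                 # assumed epoch count
#     fp16=True,                          # matches the float16 compute dtype above
#     logging_steps=100,
# )
# trainer = Trainer(model=peft_model, args=training_args, train_dataset=tokenized_dataset)
# trainer.train()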
# Step 1: Build a FAISS vector database over the dataset
import torch
from datasets import Dataset
from transformers import AutoModel, AutoTokenizer
import faiss
import numpy as np
from tqdm import tqdm  # Progress bar for batch embedding

# Load the embedding tokenizer and model
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
embedding_model = AutoModel.from_pretrained(embedding_model_name)
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_model.to(device)
# Function to generate embeddings in batches
def embed_text_batch(texts, batch_size=16):
    all_embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch_texts = texts[i:i + batch_size]
        # Tokenize and move inputs to the GPU
        inputs = embedding_tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            # Generate embeddings and move them back to CPU
            embeddings = embedding_model(**inputs).last_hidden_state.mean(dim=1).cpu().numpy()  # Mean pooling
        all_embeddings.extend(embeddings)
    return np.array(all_embeddings)
# Step 1: Process the dataset in batches
texts = tokenized_dataset["text"]
batch_size = 16 # Adjust based on Colab memory
embeddings = embed_text_batch(texts, batch_size=batch_size)
# Step 2: Add embeddings as a new column to the dataset
tokenized_dataset = tokenized_dataset.add_column("embeddings", embeddings.tolist())
# Step 3: Add FAISS index
dimension = embeddings.shape[1] # Dimension of embeddings
faiss_index = faiss.IndexFlatL2(dimension)
# Step 4: Add embeddings to FAISS index
faiss_index.add(embeddings)
# Step 5: Save the dataset and FAISS index
tokenized_dataset.save_to_disk("wikipedia_dataset_with_embeddings")
faiss.write_index(faiss_index, "wikipedia_faiss.index")
print("FAISS index and dataset saved successfully.")
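# To reuse the index in a later session, both artifacts can be reloaded from disk
# (the paths are the ones written above):
# from datasets import load_from_disk
# tokenized_dataset = load_from_disk("wikipedia_dataset_with_embeddings")
# faiss_index = faiss.read_index("wikipedia_faiss.index")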
def embed_query(query):
    # Tokenize and embed the query
    inputs = embedding_tokenizer([query], padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        query_embedding = embedding_model(**inputs).last_hidden_state.mean(dim=1).cpu().numpy()
    return query_embedding

def search_faiss(query_embedding, faiss_index, top_k=5):
    # Search the FAISS index
    distances, indices = faiss_index.search(query_embedding, top_k)
    return distances, indices

def get_top_answer(indices, dataset):
    # Retrieve the top answer from the dataset based on the indices
    return dataset["text"][indices[0][0]]  # Top result only; adjust to return more answers
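# Example retrieval round-trip with the helpers above (the query text is illustrative):
# query_embedding = embed_query("Who developed the theory of relativity?")
# distances, indices = search_faiss(query_embedding, faiss_index, top_k=5)
# print(get_top_answer(indices, tokenized_dataset))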
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import faiss
import numpy as np
# The embeddings and faiss_index are assumed to have been created as above
# Load the pre-trained LLM for generation (you can replace it with a different one)
llm_model_name = "facebook/bart-large-cnn"  # Example: any seq2seq model such as BART or T5
llm_model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name)
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
llm_model.to(device)
# Embedding model used for creating the vector database (same as the one used to generate embeddings for dataset)
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name)
embedding_model.to(device)
# Function to embed a query (same as before)
def embed_query(query):
    inputs = embedding_tokenizer([query], padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        query_embedding = embedding_model(**inputs).last_hidden_state.mean(dim=1).cpu().numpy()
    return query_embedding

# Function to search the FAISS index and retrieve the top k results
def search_faiss(query_embedding, faiss_index, top_k=5):
    distances, indices = faiss_index.search(query_embedding, top_k)
    return distances, indices
# Function to generate an answer using the LLM based on the retrieved documents
def generate_answer(query, retrieved_texts):
    # Combine the query and the retrieved texts into a single prompt
    context = " ".join(retrieved_texts)
    input_text = f"Question: {query}\nContext: {context}\nAnswer:"
    # Tokenize and pass to the LLM
    inputs = llm_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    with torch.no_grad():
        generated_ids = llm_model.generate(inputs['input_ids'], max_length=150)
    # Decode the generated response
    answer = llm_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return answer
# Function to retrieve the texts from the dataset based on FAISS index results
def get_retrieved_texts(indices, dataset, top_k=5):
    retrieved_texts = []
    for idx in indices[0][:top_k]:  # Take the top k results
        retrieved_texts.append(dataset['text'][idx])  # 'text' is the relevant field in the dataset
    return retrieved_texts
# End-to-end RAG pipeline
def rag_pipeline(question, faiss_index, dataset, top_k=3):
    # Step 1: Embed the query
    query_embedding = embed_query(question)
    # Step 2: Search the FAISS index for the top k similar documents
    distances, indices = search_faiss(query_embedding, faiss_index, top_k=top_k)
    # Step 3: Retrieve the top k relevant documents from the dataset
    retrieved_texts = get_retrieved_texts(indices, dataset, top_k=top_k)
    # Step 4: Generate the answer using the retrieved texts and the LLM
    answer = generate_answer(question, retrieved_texts)
    return answer
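# Example call of the full pipeline (the question is illustrative; the dataset passed
# in must contain a 'text' column, e.g. the tokenized_dataset built above):
# answer = rag_pipeline("Why is the sky blue?", faiss_index, tokenized_dataset, top_k=3)
# print(answer)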
# Import the necessary modules
from langchain_community.llms import Ollama
# Load the Ollama model
gen_model = Ollama(model="llama2")
# Define a function to get predefined responses for specific queries
def get_predefined_response(question):
    predefined_responses = {
        "hi": "Hello! How can I assist you today?",
        "hello": "Hi there! 😊 What can I help you with?",
        "who made you?": "I was created by Vinmay and his team.",
        "what is your purpose?": "I'm here to assist you with educational queries and provide information.",
        # Add more predefined responses as needed
    }
    # Normalize the question to make it case-insensitive
    normalized_question = question.lower()
    return predefined_responses.get(normalized_question, None)
# Modify the generate_response function to check for predefined responses
def generate_response(markdown, question, user_instructions=None, max_new_tokens=250, temperature=0.9, top_p=0.95):
    # Check for a predefined response first
    predefined_response = get_predefined_response(question)
    if predefined_response:
        return predefined_response

    instruction_text = f" Please follow these instructions: {user_instructions}" if user_instructions else ""
    prompt = (
        f"Using the provided context, please generate a unique and insightful answer that directly addresses the question:\n\n"
        f"Context:\n{markdown}\n\n"
        f"Question: {question}\n"
        f"{instruction_text}\n"
        f"If a personal question is asked, answer it in the same spirit as the predefined responses and generate your own answer.\n"
        f"Please synthesize your response by integrating the information with your own understanding: "
    )
    # Call the Ollama model using the `invoke` method
    response = gen_model.invoke(prompt, max_tokens=max_new_tokens, temperature=temperature, top_p=top_p)
    # The response may be a plain string (generated text) or a dictionary (with metadata)
    if isinstance(response, str):
        return response  # Return the raw text if it's a string
    elif isinstance(response, dict) and "choices" in response:
        return response["choices"][0]["text"]  # Extract the text from the structured response
    else:
        return "Unexpected response format."
# # Example usage
# markdown = "The sky appears blue due to the scattering of light by the atmosphere."
# question = "Hi"
# response = generate_response(markdown, question)
# print(f"Model Response: {response}")
import gradio as gr
from langchain_community.llms import Ollama
# Load the Ollama model
gen_model = Ollama(model="llama2")
# Define the manual responses
manual_responses = {
    "hi": "Hello! How can I assist you today?",
    "hello": "Hi there! What would you like to know?",
    "who made you?": "I was created by OpenAI.",
    "what is your purpose?": "I'm here to assist with educational queries!"
}
# Function to generate responses
def generate_response(user_input):
    # Normalize user input for matching
    normalized_input = user_input.lower().strip()
    # Check for manual responses
    if normalized_input in manual_responses:
        return manual_responses[normalized_input]
    # For other questions, generate a response using the model
    prompt = f"Please provide a detailed answer to the following question:\n\nQuestion: {user_input}\n"
    response = gen_model.invoke(prompt)
    return response.strip()
# Create the Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Ask a Question"),
    outputs=gr.Textbox(label="Response"),
    title="Q&A System",
    description="Ask me anything and I will respond accordingly."
)
# Launch the Gradio app
if __name__ == "__main__":
    iface.launch(share=True, inline=False)  # Use share=True to make the app publicly accessible if needed