import os
import pickle
from json import dumps, loads

import numpy as np
import openai
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import HfFileSystem
from llama_index import (
    Document,
    GPTVectorStoreIndex,
    LLMPredictor,
    PromptHelper,
    ServiceContext,
    StorageContext,
    load_index_from_storage,
)
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from utils.customLLM import CustomLLM

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

fs = HfFileSystem()

# get model
# model_name = "bigscience/bloom-560m"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, config='T5Config')

# define prompt helper
# set maximum input size
context_window = 2048
# set number of output tokens
num_output = 525
# set chunk overlap ratio
chunk_overlap_ratio = 0.2
prompt_helper = PromptHelper(context_window, num_output, chunk_overlap_ratio)

# create a pipeline
# pl = pipeline(
#     model=model,
#     tokenizer=tokenizer,
#     task="text-generation",
#     # device=0,  # GPU device number
#     # max_length=512,
#     do_sample=True,
#     top_p=0.95,
#     top_k=50,
#     temperature=0.7,
# )

# define LLM predictor and service context
llm_predictor = LLMPredictor(llm=CustomLLM())
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor, prompt_helper=prompt_helper
)


def prepare_data(file_path: str):
    """Load regulation items from a JSON file and convert each record into a Document."""
    df = pd.read_json(file_path)
    # replace empty strings with NaN and drop those rows
    df = df.replace(to_replace="", value=np.nan).dropna(axis=0)
    parsed = loads(df.to_json(orient="records"))

    documents = []
    for item in parsed:
        document = Document(
            item["paragraphText"],
            item["_id"]["$oid"],
            extra_info={
                "chapter": item["chapter"],
                "article": item["article"],
                "title": item["title"],
            },
        )
        documents.append(document)

    return documents


def initialize_index(index_name):
    """Load a persisted index from ./vectorStores/<index_name>, or build and persist it."""
    file_path = f"./vectorStores/{index_name}"
    if os.path.exists(file_path):
        # rebuild storage context
        storage_context = StorageContext.from_defaults(persist_dir=file_path)

        # local load index access
        index = load_index_from_storage(storage_context)

        # huggingface repo load access
        # with fs.open(file_path, "r") as file:
        #     index = pickle.loads(file.readlines())
        return index
    else:
        documents = prepare_data(r"./assets/regItems.json")
        index = GPTVectorStoreIndex.from_documents(
            documents, service_context=service_context
        )

        # local write access
        index.storage_context.persist(file_path)

        # huggingface repo write access
        # with fs.open(file_path, "w") as file:
        #     file.write(pickle.dumps(index))
        return index
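
# Minimal usage sketch (an assumption, not part of the original module): build or
# load the index and run a query against it. Assumes the installed llama_index
# version exposes `index.as_query_engine()`; older releases used `index.query(...)`
# directly. The index name "regulation" and the question below are illustrative.
if __name__ == "__main__":
    index = initialize_index("regulation")
    query_engine = index.as_query_engine()
    response = query_engine.query("What does chapter 1 regulate?")
    print(response)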