import os
from json import loads

import numpy as np
import openai
import pandas as pd
from dotenv import load_dotenv
from llama_index import (Document, GPTVectorStoreIndex, LLMPredictor,
                         PromptHelper, ServiceContext, StorageContext,
                         load_index_from_storage)
# transformers is only needed if the local Hugging Face pipeline below is re-enabled
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from utils.customLLM import CustomLLM

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
# Local Hugging Face model (disabled; the custom LLM wrapper below is used instead)
# model_name = "bigscience/bloom-560m"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name)

# Define the prompt helper
context_window = 2048        # maximum input size in tokens
num_output = 525             # tokens reserved for the model's output
chunk_overlap_ratio = 0.2    # maximum chunk overlap, as a ratio of chunk size
prompt_helper = PromptHelper(context_window, num_output, chunk_overlap_ratio)
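# Note: with chunk_overlap_ratio = 0.2, roughly 20% of each text chunk is
# repeated in its neighbor (about 0.2 * chunk_size tokens), so passages that
# straddle a chunk boundary keep their surrounding context when retrieved.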
# Create a text-generation pipeline for the local model (disabled)
# pl = pipeline(
#     model=model,
#     tokenizer=tokenizer,
#     task="text-generation",
#     # device=0,  # GPU device number
#     # max_length=512,
#     do_sample=True,
#     top_p=0.95,
#     top_k=50,
#     temperature=0.7,
# )
# Define the LLM predictor and the service context used to build and query the index
llm_predictor = LLMPredictor(llm=CustomLLM())
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor,
                                               prompt_helper=prompt_helper)
def prepare_data(file_path: str):
    """Load regulation items from a JSON export and wrap them as Documents."""
    df = pd.read_json(file_path)
    df = df.replace(to_replace="", value=np.nan).dropna(axis=0)  # drop rows with empty fields
    parsed = loads(df.to_json(orient="records"))
    documents = []
    for item in parsed:
        document = Document(item['paragraphText'],
                            item['_id']['$oid'],
                            extra_info={"chapter": item['chapter'],
                                        "article": item['article'],
                                        "title": item['title']})
        documents.append(document)
    return documents
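
# Expected record shape, inferred from the field accesses above (illustrative
# values; the real export may carry additional fields):
# {
#   "_id": {"$oid": "..."},
#   "paragraphText": "...",
#   "chapter": "...",
#   "article": "...",
#   "title": "..."
# }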
def initialize_index(index_name):
    file_path = f"./vectorStores/{index_name}"
    if os.path.exists(file_path):
        # Rebuild the storage context from the persisted directory
        storage_context = StorageContext.from_defaults(persist_dir=file_path)
        # Load the index, passing the service context so the custom LLM is
        # used instead of the default OpenAI predictor
        index = load_index_from_storage(storage_context,
                                        service_context=service_context)
        return index
    else:
        documents = prepare_data(r"./assets/regItems.json")
        index = GPTVectorStoreIndex.from_documents(documents,
                                                   service_context=service_context)
        index.storage_context.persist(file_path)
        return index
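
# Minimal usage sketch. The index name "regulations" and the query text are
# illustrative assumptions, not part of the app; as_query_engine()/query()
# are the standard llama_index query entry points for this API version.
if __name__ == "__main__":
    index = initialize_index("regulations")
    query_engine = index.as_query_engine()
    response = query_engine.query("Which chapter covers reporting requirements?")
    print(response)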