# RegBot4.0 / models / bloom.py
import os
from json import loads

import numpy as np
import openai
import pandas as pd
from dotenv import load_dotenv
from llama_index import (Document, GPTVectorStoreIndex, LLMPredictor,
                         PromptHelper, ServiceContext, StorageContext,
                         load_index_from_storage)
# Only needed if the local BLOOM pipeline below is enabled.
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from utils.customLLM import CustomLLM

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
# get model
# model_name = "bigscience/bloom-560m"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name)
# define prompt helper
# set maximum input size (in tokens)
context_window = 2048
# set number of output tokens
num_output = 525
# set maximum chunk overlap as a ratio of the window
chunk_overlap_ratio = 0.2
prompt_helper = PromptHelper(
    context_window=context_window,
    num_output=num_output,
    chunk_overlap_ratio=chunk_overlap_ratio,
)
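
# Rough token budget implied by the values above (an illustration, not
# llama_index's exact internal arithmetic): reserving 525 of the 2048-token
# window for output leaves about 2048 - 525 = 1523 tokens for the prompt and
# retrieved context, and chunk_overlap_ratio=0.2 lets adjacent chunks share
# up to ~20% of the window (~410 tokens) when documents are split.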
# create a pipeline
# pl = pipeline(
#     model=model,
#     tokenizer=tokenizer,
#     task="text-generation",
#     # device=0,  # GPU device number
#     # max_length=512,
#     do_sample=True,
#     top_p=0.95,
#     top_k=50,
#     temperature=0.7,
# )
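
# If the local pipeline above is enabled, one way to plug it in instead of
# CustomLLM (a sketch, assuming the langchain HuggingFacePipeline wrapper
# that LLMPredictor accepts; untested here):
# from langchain.llms import HuggingFacePipeline
# local_llm = HuggingFacePipeline(pipeline=pl)
# llm_predictor = LLMPredictor(llm=local_llm)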
# define llm
llm_predictor = LLMPredictor(llm=CustomLLM())
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor, prompt_helper=prompt_helper
)

def prepare_data(file_path: str):
    """Load regulation items from JSON and wrap each row as a llama_index Document."""
    df = pd.read_json(file_path)
    # drop rows with empty strings by converting them to NaN first
    df = df.replace(to_replace="", value=np.nan).dropna(axis=0)
    parsed = loads(df.to_json(orient="records"))

    documents = []
    for item in parsed:
        # positional args: document text, then the MongoDB ObjectId as doc_id
        document = Document(
            item["paragraphText"],
            item["_id"]["$oid"],
            extra_info={
                "chapter": item["chapter"],
                "article": item["article"],
                "title": item["title"],
            },
        )
        documents.append(document)
    return documents
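
# Example usage (a sketch; assumes the JSON file carries the fields
# referenced above):
# documents = prepare_data("./assets/regItems.json")
# print(f"loaded {len(documents)} documents")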

def initialize_index(index_name):
    """Load a persisted index from ./vectorStores/<index_name>, or build and persist it."""
    file_path = f"./vectorStores/{index_name}"
    if os.path.exists(file_path):
        # rebuild storage context from the persisted files
        storage_context = StorageContext.from_defaults(persist_dir=file_path)
        # load index, passing the same service_context so the custom LLM
        # is used after a reload instead of the library default
        index = load_index_from_storage(storage_context, service_context=service_context)
        return index
    else:
        documents = prepare_data(r"./assets/regItems.json")
        index = GPTVectorStoreIndex.from_documents(
            documents, service_context=service_context
        )
        index.storage_context.persist(file_path)
        return index
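
# Example usage (a sketch; "regItems" is a hypothetical index name, and any
# name maps to ./vectorStores/<name>; assumes the 0.6-era llama_index
# as_query_engine API):
# index = initialize_index("regItems")
# query_engine = index.as_query_engine()
# print(query_engine.query("What does chapter 1 cover?"))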