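"""Custom LLM + llama_index utilities for the document Q&A Space.

Wraps a locally hosted BLOOM text-generation pipeline as a LangChain LLM and
builds (or reloads) a persisted vector index over the PDFs in ./assets/pdf.
"""
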
import os
import pickle
from typing import Any, List, Mapping, Optional

import openai
from dotenv import load_dotenv
from huggingface_hub import HfFileSystem
from langchain.llms.base import LLM
from llama_index import (
Document,
GPTVectorStoreIndex,
LLMPredictor,
PromptHelper,
ServiceContext,
SimpleDirectoryReader,
StorageContext,
load_index_from_storage,
)
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# from utils.customLLM import CustomLLM

load_dotenv()
# openai.api_key = os.getenv("OPENAI_API_KEY")

# file system handle for the Hugging Face Hub (used by the commented-out remote index storage code below)
fs = HfFileSystem()
# define prompt helper
# set maximum input size
CONTEXT_WINDOW = 2048
# set number of output tokens
NUM_OUTPUT = 525
# set ratio of chunk overlap
CHUNK_OVERLAP_RATIO = 0.2

prompt_helper = PromptHelper(
    context_window=CONTEXT_WINDOW,
    num_output=NUM_OUTPUT,
    chunk_overlap_ratio=CHUNK_OVERLAP_RATIO,
)
llm_model_name = "bigscience/bloom-560m"
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
# the correct config is resolved automatically from the BLOOM checkpoint
model = AutoModelForCausalLM.from_pretrained(llm_model_name)

model_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    # device=0,  # GPU device number
    # max_length=512,
    do_sample=True,
    top_p=0.95,
    top_k=50,
    temperature=0.7,
)


class CustomLLM(LLM):
    """LangChain-compatible wrapper around the local text-generation pipeline."""

    model_name = llm_model_name
    pipeline = model_pipeline

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        prompt_length = len(prompt)
        response = self.pipeline(prompt, max_new_tokens=NUM_OUTPUT)[0]["generated_text"]
        # only return newly generated tokens
        return response[prompt_length:]

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"name_of_model": self.model_name}

    @property
    def _llm_type(self) -> str:
        return "custom"


class LlamaCustom:
    """Builds or loads a persisted llama_index vector index and answers queries with it."""

    # define llm
    llm_predictor = LLMPredictor(llm=CustomLLM())
    service_context = ServiceContext.from_defaults(
        llm_predictor=llm_predictor, prompt_helper=prompt_helper
    )

    def __init__(self, name: str) -> None:
        self.vector_index = self.initialize_index(index_name=name)

    def initialize_index(self, index_name):
        file_path = f"./vectorStores/{index_name}"
        if os.path.exists(file_path):
            # rebuild storage context
            storage_context = StorageContext.from_defaults(persist_dir=file_path)
            # local load index access; pass the service context so the loaded
            # index keeps using the local LLM instead of the OpenAI default
            index = load_index_from_storage(
                storage_context, service_context=self.service_context
            )
            # huggingface repo load access
            # with fs.open(file_path, "r") as file:
            #     index = pickle.loads(file.readlines())
            return index
        else:
            # documents = prepare_data(r"./assets/regItems.json")
            documents = SimpleDirectoryReader(input_dir="./assets/pdf").load_data()
            index = GPTVectorStoreIndex.from_documents(
                documents, service_context=self.service_context
            )
            # local write access
            index.storage_context.persist(file_path)
            # huggingface repo write access
            # with fs.open(file_path, "w") as file:
            #     file.write(pickle.dumps(index))
            return index

    def get_response(self, query_str):
        print("query_str: ", query_str)
        query_engine = self.vector_index.as_query_engine()
        response = query_engine.query(query_str)
        return str(response)
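

# Illustrative usage sketch: the index name "demo" is an assumption; pass the name of an
# existing folder under ./vectorStores, or make sure ./assets/pdf contains documents so
# the index is built on first run.
if __name__ == "__main__":
    app = LlamaCustom(name="demo")
    print(app.get_response("What are the main topics covered in the documents?"))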