zhtet committed
Commit 183c15c
1 Parent(s): 5bf0c23

Update models/llamaCustom.py

Files changed (1)
  1. models/llamaCustom.py +40 -85
models/llamaCustom.py CHANGED
@@ -6,9 +6,9 @@ from typing import Any, List, Mapping, Optional
 import numpy as np
 import openai
 import pandas as pd
-import streamlit as st
 from dotenv import load_dotenv
-from huggingface_hub import HfFileSystem, Repository
+from huggingface_hub import HfFileSystem
+from langchain.llms.base import LLM
 from llama_index import (
     Document,
     GPTVectorStoreIndex,
@@ -19,17 +19,12 @@ from llama_index import (
     StorageContext,
     load_index_from_storage,
 )
-from llama_index.llms import CompletionResponse, CustomLLM, LLMMetadata
-
-# from langchain.llms.base import LLM
-# from llama_index.prompts import Prompt
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, pipeline
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

 # from utils.customLLM import CustomLLM

 load_dotenv()
 # openai.api_key = os.getenv("OPENAI_API_KEY")
-
 fs = HfFileSystem()

 # define prompt helper
@@ -38,98 +33,62 @@ CONTEXT_WINDOW = 2048
 # set number of output tokens
 NUM_OUTPUT = 525
 # set maximum chunk overlap
-CHUNK_OVERLAP_RATIO = 0.2
+CHUNK_OVERLAP_RATION = 0.2

 prompt_helper = PromptHelper(
     context_window=CONTEXT_WINDOW,
     num_output=NUM_OUTPUT,
-    chunk_overlap_ratio=CHUNK_OVERLAP_RATIO,
+    chunk_overlap_ratio=CHUNK_OVERLAP_RATION,
 )

-
-@st.cache_resource
-def load_model(mode_name: str):
-    # llm_model_name = "bigscience/bloom-560m"
-    tokenizer = AutoTokenizer.from_pretrained(mode_name)
-    model = AutoModelForCausalLM.from_pretrained(mode_name, config="T5Config")
-
-    pipe = pipeline(
-        task="text-generation",
-        model=model,
-        tokenizer=tokenizer,
-        # device=0, # GPU device number
-        # max_length=512,
-        do_sample=True,
-        top_p=0.95,
-        top_k=50,
-        temperature=0.7,
-    )
-
-    return pipe
+llm_model_name = "bigscience/bloom-560m"
+tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
+model = AutoModelForCausalLM.from_pretrained(llm_model_name, config="T5Config")
+
+model_pipeline = pipeline(
+    model=model,
+    tokenizer=tokenizer,
+    task="text-generation",
+    # device=0, # GPU device number
+    # max_length=512,
+    do_sample=True,
+    top_p=0.95,
+    top_k=50,
+    temperature=0.7,
+)


-class OurLLM(CustomLLM):
-    def __init__(self, model_name: str, model_pipeline):
-        self.model_name = model_name
-        self.pipeline = model_pipeline
+class CustomLLM(LLM):
+    pipeline = model_pipeline

-    @property
-    def metadata(self) -> LLMMetadata:
-        """Get LLM metadata."""
-        return LLMMetadata(
-            context_window=CONTEXT_WINDOW,
-            num_output=NUM_OUTPUT,
-            model_name=self.model_name,
-        )
-
-    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
+    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
         prompt_length = len(prompt)
-        response = self.pipeline(prompt, max_new_tokens=NUM_OUTPUT)[0]["generated_text"]
+        response = self.pipeline(prompt, max_new_tokens=525)[0]["generated_text"]

         # only return newly generated tokens
-        text = response[prompt_length:]
-        return CompletionResponse(text=text)
-
-    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
-        raise NotImplementedError()
+        return response[prompt_length:]

-    # def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
-    #     prompt_length = len(prompt)
-    #     response = self.pipeline(prompt, max_new_tokens=525)[0]["generated_text"]
-
-    #     # only return newly generated tokens
-    #     return response[prompt_length:]
-
-    # @property
-    # def _identifying_params(self) -> Mapping[str, Any]:
-    #     return {"name_of_model": self.model_name}
+    @property
+    def _identifying_params(self) -> Mapping[str, Any]:
+        return {"name_of_model": self.model_name}

-    # @property
-    # def _llm_type(self) -> str:
-    #     return "custom"
+    @property
+    def _llm_type(self) -> str:
+        return "custom"


-@st.cache_resource
 class LlamaCustom:
     # define llm
-    # llm_predictor = LLMPredictor(llm=OurLLM())
-    # service_context = ServiceContext.from_defaults(
-    #     llm_predictor=llm_predictor, prompt_helper=prompt_helper
-    # )
-
-    def __init__(self, model_name: str) -> None:
-        pipe = load_model(mode_name=model_name)
-        llm = OurLLM(model_name=model_name, model_pipeline=pipe)
-        self.service_context = ServiceContext.from_defaults(
-            llm=llm, prompt_helper=prompt_helper
-        )
-        self.vector_index = self.initialize_index(model_name=model_name)
-
-    def initialize_index(self, model_name: str):
-        index_name = model_name.split("/")[-1]
+    llm_predictor = LLMPredictor(llm=CustomLLM())
+    service_context = ServiceContext.from_defaults(
+        llm_predictor=llm_predictor, prompt_helper=prompt_helper
+    )

-        file_path = f"./vectorStores/{index_name}"
+    def __init__(self, name: str) -> None:
+        self.vector_index = self.initialize_index(index_name=name)

+    def initialize_index(self, index_name):
+        file_path = f"./vectorStores/{index_name}"
         if os.path.exists(path=file_path):
             # rebuild storage context
             storage_context = StorageContext.from_defaults(persist_dir=file_path)
@@ -160,9 +119,5 @@ class LlamaCustom:
     def get_response(self, query_str):
         print("query_str: ", query_str)
         query_engine = self.vector_index.as_query_engine()
-        # query_engine = self.vector_index.as_query_engine(
-        #     text_qa_template=text_qa_template, refine_template=refine_template
-        # )
         response = query_engine.query(query_str)
-        print("metadata: ", response.metadata)
-        return str(response)
+        return str(response)
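For reference, a minimal usage sketch of the updated class as it stands after this commit. It is not part of the commit itself; it assumes the module is importable as models.llamaCustom, that a persisted index already exists under ./vectorStores/, and the index name used below is hypothetical.

# Usage sketch only; assumptions noted above.
from models.llamaCustom import LlamaCustom

llama = LlamaCustom(name="bloom-560m")  # initialize_index resolves this to ./vectorStores/bloom-560m
print(llama.get_response("What topics does the indexed document cover?"))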