drakosfire committed
Commit 9bade29
1 Parent(s): c21541c

Basic Logging, timestamp, simple token estimate, source files

Files changed (2)
  1. SRD_embeddings.csv +2 -2
  2. app.py +54 -12
SRD_embeddings.csv CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1315c8fc5255c125c06b6c9e3ec4c84df91fd60e03596a363e6d7491df8171ba
-size 46149879
+oid sha256:0ffdfe9de524d440d57d359270fe8a774009188b528946a79a55f8dd7294e5fe
+size 51272010
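
Since SRD_embeddings.csv is tracked with Git LFS, only the pointer's oid and size change when the embeddings file is regenerated. A quick integrity check for a local copy, as a sketch using only the Python standard library (assumes LFS has already materialized the real file at the path below):

import hashlib

# Hash the checked-out CSV and compare against the pointer for this commit.
with open("./SRD_embeddings.csv", "rb") as f:
    data = f.read()
print("size:", len(data))                         # expected: 51272010
print("oid :", hashlib.sha256(data).hexdigest())  # expected: 0ffdfe9de524d440d57d359270fe8a774009188b528946a79a55f8dd7294e5fe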
app.py CHANGED
@@ -5,6 +5,7 @@ from sentence_transformers import util, SentenceTransformer
 import torch
 import time
 from time import perf_counter as timer
+from datetime import datetime
 import textwrap
 import json
 import textwrap
@@ -15,10 +16,7 @@ print("Launching")
 
 client = OpenAI()
 
-# Define helper function to print wrapped text
-def print_wrapped(text, wrap_length=80):
-    wrapped_text = textwrap.fill(text, wrap_length)
-    print(wrapped_text)
+
 
 # Import saved file and view
 embeddings_df_save_path = "./SRD_embeddings.csv"
@@ -39,6 +37,28 @@ pages_and_chunks = text_chunks_and_embedding_df_load.to_dict(orient="records")
 # Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
 embeddings = torch.tensor(np.array(text_chunks_and_embedding_df_load["embedding"].tolist()), dtype=torch.float32).to('cpu')
 
+# Define helper function to print wrapped text
+def print_wrapped(text, wrap_length=80):
+    wrapped_text = textwrap.fill(text, wrap_length)
+    print(wrapped_text)
+
+def hybrid_estimate_tokens(text: str) -> float:
+    # Part 1: Estimate based on spaces and punctuation
+    estimated_words = text.count(' ') + 1  # Counting words by spaces
+    punctuation_count = sum(1 for char in text if char in ',.!?;:')  # Counting punctuation as potential separate tokens
+    estimate1 = estimated_words + punctuation_count
+
+    # Part 2: Estimate based on total characters divided by average token length
+    average_token_length = 4
+    total_characters = len(text)
+    estimate2 = (total_characters // average_token_length) + punctuation_count
+
+    # Average the two estimates
+    estimated_tokens = (estimate1 + estimate2) / 2
+
+    return estimated_tokens
+
+
 def retrieve_relevant_resources(query: str,
                                 embeddings: torch.tensor,
                                 model: SentenceTransformer=embedding_model,
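
To see what the new hybrid_estimate_tokens heuristic produces, here is the arithmetic traced on a short query (a worked example, not part of the commit):

text = "How does grappling work?"
# estimate1: 3 spaces + 1 = 4 words, plus 1 punctuation character ('?') = 5
# estimate2: 24 characters // 4 = 6, plus 1 punctuation character = 7
# average: (5 + 7) / 2 = 6.0
print(hybrid_estimate_tokens(text))  # 6.0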
@@ -84,18 +104,23 @@ def print_top_results_and_scores(query: str,
     # Loop through zipped-together scores and indices
     for score, index in zip(scores, indices):
         print(f"Score: {score:.4f}")
+        print(f"Token Count : {pages_and_chunks[index]['chunk_token_count']}")
         # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
         print_wrapped(pages_and_chunks[index]["sentence_chunk"])
         # Print the page number too so we can reference the textbook further and check the results
         print(f"File of Origin: {pages_and_chunks[index]['file_path']}")
         print("\n")
 
+    return scores, indices
+
 def prompt_formatter(query: str,
                      context_items: list[dict]) -> str:
-    """
-    Augments query with text-based context from context_items.
-    """
     # Join context items into one dotted paragraph
+    # print(context_items[0])
+
+    # Alternate print method
+    # print("\n".join([item["file_path"] + "\n" + str(item['chunk_token_count']) + "\n" + item["sentence_chunk"] for item in context_items]))
+
     context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])
 
     # Create a base prompt with examples to help the model
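
With the added return statement, print_top_results_and_scores now doubles as the retrieval call: it logs each hit and hands scores and indices back to the caller. Its underlying helper retrieve_relevant_resources is not shown in this diff; for this kind of sentence-transformers setup it is typically implemented along these lines (a sketch under that assumption; the name retrieve_relevant_resources_sketch and the parameter n_resources_to_return are illustrative):

import torch
from sentence_transformers import util

def retrieve_relevant_resources_sketch(query, embeddings, model, n_resources_to_return=5):
    # Embed the query, dot-product it against every chunk embedding, keep the top k.
    query_embedding = model.encode(query, convert_to_tensor=True)
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    scores, indices = torch.topk(dot_scores, k=n_resources_to_return)
    return scores, indices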
@@ -139,23 +164,36 @@ Use the context provided to answer the user's query concisely. """
 
 
 with gr.Blocks() as RulesLawyer:
+
+    message_state = gr.State()
+    chatbot_state = gr.State([])
     chatbot = gr.Chatbot()
     msg = gr.Textbox()
     clear = gr.ClearButton([msg, chatbot])
 
+    def store_message(message):
+
+        return message
+
+
     def respond(message, chat_history):
+        print(datetime.now())
+        print(f"User Input : {message}")
+        print(f"Chat History: {chat_history}")
+        print(f"""Token Estimate: {hybrid_estimate_tokens(f"{message} {chat_history}")}""")
 
         # Get relevant resources
-        scores, indices = retrieve_relevant_resources(query=message,
+        scores, indices = print_top_results_and_scores(query=message,
                                                        embeddings=embeddings)
-
+
         # Create a list of context items
        context_items = [pages_and_chunks[i] for i in indices]
+
 
         # Format prompt with context items
-        prompt = prompt_formatter(query=message,
+        prompt = prompt_formatter(query=f"Chat History : {chat_history} + {message}",
                                   context_items=context_items)
-        print(prompt)
+
         bot_message = client.chat.completions.create(
             model="gpt-4",
             messages=[
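
The store_message handler mirrors the live textbox and chatbot values into gr.State holders (the wiring appears at the end of the file), so respond reads from those states rather than from the components directly. A minimal standalone sketch of that round-trip pattern (hypothetical demo, not from the commit):

import gradio as gr

with gr.Blocks() as demo:
    msg_state = gr.State()                       # mirrors the textbox between events
    box = gr.Textbox()
    chat = gr.Chatbot()

    def store(value):
        return value                             # copy the component value into state

    def reply(message, history):
        history = (history or []) + [(message, f"echo: {message}")]
        return "", history

    box.change(store, [box], [msg_state])        # sync state on every edit
    box.submit(reply, [msg_state, chat], [box, chat])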
@@ -171,9 +209,13 @@ with gr.Blocks() as RulesLawyer:
             presence_penalty=0
         )
         chat_history.append((message, bot_message.choices[0].message.content))
+        print(f"Response : {bot_message.choices[0].message.content}")
+
         time.sleep(2)
         return "", chat_history
-    msg.submit(respond, [msg, chatbot], [msg, chatbot])
+    msg.change(store_message, inputs = [msg], outputs = [message_state])
+    chatbot.change(store_message, [chatbot], [chatbot_state])
+    msg.submit(respond, [message_state, chatbot_state], [msg, chatbot])
 
 if __name__ == "__main__":
     RulesLawyer.launch()
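
hybrid_estimate_tokens is only a heuristic, so the logged token estimate can drift from what GPT-4 actually counts. When exact numbers matter, it can be cross-checked against the model's real tokenizer, e.g. with tiktoken (a sketch assuming tiktoken is installed):

import tiktoken

enc = tiktoken.encoding_for_model("gpt-4")

def exact_token_count(text: str) -> int:
    # Count tokens with the same BPE encoding GPT-4 uses.
    return len(enc.encode(text))

print(exact_token_count("How does grappling work?"))  # exact count to compare with the estimate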
 