awacke1 committed
Commit 903df6b
1 Parent(s): 4cda255

Update app.py

Files changed (1)
  1. app.py +1 -218
app.py CHANGED
@@ -1,220 +1,3 @@
- import spacy
- import wikipediaapi
- import wikipedia
- from wikipedia.exceptions import DisambiguationError
- from transformers import TFAutoModel, AutoTokenizer
- import numpy as np
- import pandas as pd
- import faiss
- import gradio as gr
-
- try:
-     nlp = spacy.load("en_core_web_sm")
- except:
-     spacy.cli.download("en_core_web_sm")
-     nlp = spacy.load("en_core_web_sm")
-
- wh_words = ['what', 'who', 'how', 'when', 'which']
- def get_concepts(text):
-     text = text.lower()
-     doc = nlp(text)
-     concepts = []
-     for chunk in doc.noun_chunks:
-         if chunk.text not in wh_words:
-             concepts.append(chunk.text)
-     return concepts
-
- def get_passages(text, k=100):
-     doc = nlp(text)
-     passages = []
-     passage_len = 0
-     passage = ""
-     sents = list(doc.sents)
-     for i in range(len(sents)):
-         sen = sents[i]
-         passage_len+=len(sen)
-         if passage_len >= k:
-             passages.append(passage)
-             passage = sen.text
-             passage_len = len(sen)
-             continue
-
-         elif i==(len(sents)-1):
-             passage+=" "+sen.text
-             passages.append(passage)
-             passage = ""
-             passage_len = 0
-             continue
-
-         passage+=" "+sen.text
-     return passages
-
- def get_dicts_for_dpr(concepts, n_results=20, k=100):
-     dicts = []
-     for concept in concepts:
-         wikis = wikipedia.search(concept, results=n_results)
-         print(concept, "No of Wikis: ",len(wikis))
-         for wiki in wikis:
-             try:
-                 html_page = wikipedia.page(title = wiki, auto_suggest = False)
-             except DisambiguationError:
-                 continue
-
-             htmlResults=html_page.content
-
-             passages = get_passages(htmlResults, k=k)
-             for passage in passages:
-                 i_dicts = {}
-                 i_dicts['text'] = passage
-                 i_dicts['title'] = wiki
-                 dicts.append(i_dicts)
-     return dicts
-
- passage_encoder = TFAutoModel.from_pretrained("nlpconnect/dpr-ctx_encoder_bert_uncased_L-2_H-128_A-2")
- query_encoder = TFAutoModel.from_pretrained("nlpconnect/dpr-question_encoder_bert_uncased_L-2_H-128_A-2")
- p_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/dpr-ctx_encoder_bert_uncased_L-2_H-128_A-2")
- q_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/dpr-question_encoder_bert_uncased_L-2_H-128_A-2")
-
- def get_title_text_combined(passage_dicts):
-     res = []
-     for p in passage_dicts:
-         res.append(tuple((p['title'], p['text'])))
-     return res
-
- def extracted_passage_embeddings(processed_passages, max_length=156):
-     passage_inputs = p_tokenizer.batch_encode_plus(
-         processed_passages,
-         add_special_tokens=True,
-         truncation=True,
-         padding="max_length",
-         max_length=max_length,
-         return_token_type_ids=True
-     )
-     passage_embeddings = passage_encoder.predict([np.array(passage_inputs['input_ids']),
-                                                   np.array(passage_inputs['attention_mask']),
-                                                   np.array(passage_inputs['token_type_ids'])],
-                                                  batch_size=64,
-                                                  verbose=1)
-     return passage_embeddings
-
- def extracted_query_embeddings(queries, max_length=64):
-     query_inputs = q_tokenizer.batch_encode_plus(
-         queries,
-         add_special_tokens=True,
-         truncation=True,
-         padding="max_length",
-         max_length=max_length,
-         return_token_type_ids=True
-     )
-     query_embeddings = query_encoder.predict([np.array(query_inputs['input_ids']),
-                                               np.array(query_inputs['attention_mask']),
-                                               np.array(query_inputs['token_type_ids'])],
-                                              batch_size=1,
-                                              verbose=1)
-     return query_embeddings
-
- #Wikipedia API:
-
- def get_pagetext(page):
-     s=str(page).replace("/t","")
-
-     return s
-
- def get_wiki_summary(search):
-     wiki_wiki = wikipediaapi.Wikipedia('en')
-     page = wiki_wiki.page(search)
-
-     isExist = page.exists()
-     if not isExist:
-         return isExist, "Not found", "Not found", "Not found", "Not found"
-
-     pageurl = page.fullurl
-     pagetitle = page.title
-     pagesummary = page.summary[0:60]
-     pagetext = get_pagetext(page.text)
-
-     backlinks = page.backlinks
-     linklist = ""
-     for link in backlinks.items():
-         pui = link[0]
-         linklist += pui + " , "
-         a=1
-
-     categories = page.categories
-     categorylist = ""
-     for category in categories.items():
-         pui = category[0]
-         categorylist += pui + " , "
-         a=1
-
-     links = page.links
-     linklist2 = ""
-     for link in links.items():
-         pui = link[0]
-         linklist2 += pui + " , "
-         a=1
-
-     sections = page.sections
-
-
-     ex_dic = {
-         'Entity' : ["URL","Title","Summary", "Text", "Backlinks", "Links", "Categories"],
-         'Value': [pageurl, pagetitle, pagesummary, pagetext, linklist,linklist2, categorylist ]
-     }
-
-     #columns = [pageurl,pagetitle]
-     #index = [pagesummary,pagetext]
-     #df = pd.DataFrame(page, columns=columns, index=index)
-     #df = pd.DataFrame(ex_dic, columns=columns, index=index)
-     df = pd.DataFrame(ex_dic)
-
-     return df
-
-
- def search(question):
-     concepts = get_concepts(question)
-     print("concepts: ",concepts)
-     dicts = get_dicts_for_dpr(concepts, n_results=1)
-     lendicts = len(dicts)
-     print("dicts len: ", lendicts)
-     if lendicts == 0:
-         return pd.DataFrame()
-     processed_passages = get_title_text_combined(dicts)
-     passage_embeddings = extracted_passage_embeddings(processed_passages)
-     query_embeddings = extracted_query_embeddings([question])
-     faiss_index = faiss.IndexFlatL2(128)
-     faiss_index.add(passage_embeddings.pooler_output)
-     # prob, index = faiss_index.search(query_embeddings.pooler_output, k=1000)
-     prob, index = faiss_index.search(query_embeddings.pooler_output, k=lendicts)
-     return pd.DataFrame([dicts[i] for i in index[0]])
-
-
- # AI UI SOTA - gradio blocks with UI formatting, and event driven UI
- with gr.Blocks() as demo: # Block documentation on event listeners, start here: https://gradio.app/blocks_and_event_listeners/
-
-
-     gr.Markdown("<h1><center>🍰 Ultimate Wikipedia AI 🎨</center></h1>")
-     gr.Markdown("""<div align="center">Search and Find Anything Then Use in AI! <a href="https://www.mediawiki.org/wiki/API:Main_page">MediaWiki - API for Wikipedia</a>. <a href="https://paperswithcode.com/datasets?q=wikipedia&v=lst&o=newest">Papers,Code,Datasets for SOTA w/ Wikipedia</a>""")
-     with gr.Row(): # inputs and buttons
-         inp = gr.Textbox(lines=1, default="Syd Mead", label="Question")
-     with gr.Row(): # inputs and buttons
-         b3 = gr.Button("Search AI Summaries")
-         b4 = gr.Button("Search Web Live")
-     with gr.Row(): # outputs DF1
-         out = gr.Dataframe(label="Answers", type="pandas")
-     with gr.Row(): # output DF2
-         out_DF = gr.Dataframe(wrap=True, max_rows=1000, overflow_row_behaviour= "paginate", datatype = ["markdown", "markdown"], headers=['Entity', 'Value'])
-     inp.submit(fn=get_wiki_summary, inputs=inp, outputs=out_DF)
-     b3.click(fn=search, inputs=inp, outputs=out)
-     b4.click(fn=get_wiki_summary, inputs=inp, outputs=out_DF )
- demo.launch(debug=True, show_error=True)
-
-
-
-
-
-
-
  UseMemory=True
  
  HF_TOKEN=os.environ.get("HF_TOKEN")
@@ -315,7 +98,7 @@ def chat(message, history):
  
      if UseMemory:
          #outputfileName = 'ChatbotMemory.csv'
-         outputfileName = 'ChatbotMemory2.csv' # Test first time file create
+         outputfileName = 'ChatbotMemory3.csv' # Test first time file create
          df = store_message(message, response, outputfileName) # Save to dataset
          basedir = get_base(outputfileName)
  
104