Upload 6 files

- corpus_embeddings_bi_encoder.pickle +3 -0
- df_combined_paris.csv +0 -0
- embeddings.npy +3 -0
- paris-newer.py +297 -0
- paris_clean_newer.csv +0 -0
- tokenized_corpus.pickle +3 -0
corpus_embeddings_bi_encoder.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1854af45783940daefdea27ee8e42f026faefdc4ff4a41067c6ee4ca6eb74ade
size 64918
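The three lines above are a Git LFS pointer, not the pickle itself: the binary is kept in LFS storage, identified by its SHA-256 and size, and is materialized on checkout (for example with git lfs pull). A small hypothetical helper, not part of this commit, for telling an un-fetched pointer apart from the real file:

# Hypothetical helper (assumption, not in this commit): a checked-out file that
# is still an un-fetched LFS pointer starts with the spec header shown above.
def is_lfs_pointer(path: str) -> bool:
    with open(path, "rb") as f:
        return f.read(100).startswith(b"version https://git-lfs.github.com/spec/v1")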
df_combined_paris.csv
ADDED
The diff for this file is too large to render.
embeddings.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3759225896afa4282dee721d96d1d1a8085cde7ccffe29e975568a5499a36548
size 64640
paris-newer.py
ADDED
@@ -0,0 +1,297 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: Hamza Farooq
"""

import os
import re
import string
import time
import pickle
import datetime
from collections import Counter
from heapq import nlargest
from string import punctuation

import numpy as np
import pandas as pd
import scipy.spatial
import torch
import nltk
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from tqdm.autonotebook import tqdm

import spacy
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS

import streamlit as st
import transformers
from transformers import BartTokenizer, BartForConditionalGeneration
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
from sentence_transformers import SentenceTransformer, CrossEncoder, util

nlp = spacy.load("en_core_web_sm")

# import utils as utl
# tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

punctuation = punctuation + '\n'


def main():
    # Settings
    st.set_page_config(layout="wide", page_title='Paris Hotel Finder', page_icon="🎈")

    @st.cache(allow_output_mutation=True)
    def load_model():
        # Two bi-encoders (general-purpose and QA-tuned) plus a cross-encoder for re-ranking.
        return (SentenceTransformer('all-MiniLM-L6-v2'),
                SentenceTransformer('multi-qa-MiniLM-L6-cos-v1'),
                CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2'))

    embedder, bi_encoder, cross_encoder = load_model()

    st.title("travelle - Parisian Hotel Finder")
    with st.expander("ℹ️ - About this app", expanded=True):
        st.write(
            """
- travelle is a hotel search engine that lets you describe what you want in free text, so the results are personalized to your preferences instead of making you spend hours scrolling through hotel lists on other travel sites.
- We use natural language processing and big data to return results customized to your preferences.
- You can enter just about anything, and we will narrow the results to what most closely matches your requirements.
- For example, a query like "Hotel near the Eiffel and cheaper than $300 per night with free breakfast" returns the closest matches.
            """
        )

    def lower_case(input_str):
        return input_str.lower()

    # Load the review data and the per-hotel summaries.
    df_all = pd.read_csv('paris_clean_newer.csv')

    df_combined = (df_all.sort_values(['Hotel'])
                   .groupby('Hotel', sort=False).text.apply(''.join)
                   .reset_index(name='all_review'))
    df_combined_paris_summary = pd.read_csv('df_combined_paris.csv')
    df_combined_paris_summary = df_combined_paris_summary[['Hotel', 'summary']]

    # Keep only alphanumerics and whitespace, then lower-case the concatenated reviews.
    df_combined['all_review'] = df_combined['all_review'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
    df_combined['all_review'] = df_combined['all_review'].apply(lower_case)

    df_basic = df_all[['Hotel', 'description', 'price_per_night']].drop_duplicates()
    df_basic = df_basic.merge(df_combined_paris_summary, how='left')
    df_combined_e = df_combined.merge(df_basic)
    df_combined_e['all_review'] = df_combined_e['description'] + df_combined_e['all_review'] + df_combined_e['price_per_night']

    df = df_combined_e.copy()

    df_sentences = df_combined_e.set_index("all_review")
    df_sentences = df_sentences["Hotel"].to_dict()
    df_sentences_list = list(df_sentences.keys())
    df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]

    corpus = df_sentences_list
    # corpus_embeddings = embedder.encode(corpus, show_progress_bar=True)
    corpus_embeddings = np.load('embeddings.npy')

    bi_encoder.max_seq_length = 512  # Truncate long passages to 512 tokens
    top_k = 32                       # Number of passages to retrieve with the bi-encoder

    # The bi-encoder retrieves candidate passages; a cross-encoder then re-ranks them to improve quality.

    with open('corpus_embeddings_bi_encoder.pickle', 'rb') as pkl:
        doc_embedding = pickle.load(pkl)

    with open('tokenized_corpus.pickle', 'rb') as pkl:
        tokenized_corpus = pickle.load(pkl)

    bm25 = BM25Okapi(tokenized_corpus)
    passages = corpus

    # We lower-case the text and drop stop-words before indexing.
    def bm25_tokenizer(text):
        tokenized_doc = []
        for token in text.lower().split():
            token = token.strip(string.punctuation)
            if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
                tokenized_doc.append(token)
        return tokenized_doc

    def search(query):
        # Show the named entities spaCy finds in the query.
        doc = nlp(str(query))
        ent_html = displacy.render(doc, style="ent", jupyter=False)
        st.markdown(ent_html, unsafe_allow_html=True)

        ##### BM25 search (lexical search) #####
        bm25_scores = bm25.get_scores(bm25_tokenizer(query))
        top_n = np.argpartition(bm25_scores, -5)[-5:]
        bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
        bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

        bm25list = {}
        st.title("Top-5 lexical search (BM25) hits")
        for hit in bm25_hits[0:5]:
            row_dict = df.loc[df['all_review'] == corpus[hit['corpus_id']]]
            st.subheader(row_dict['Hotel'].values[0])
            de = df_basic.loc[df_basic.Hotel == row_dict['Hotel'].values[0]]
            st.write(f'\tPrice Per night: {de.price_per_night.values[0]}')
            st.write(f'Description: {de.description.values[0]}')
            st.expander(de.description.values[0], expanded=False)
            bm25list[row_dict['Hotel'].values[0]] = de.description.values[0][0:200]

        ##### Semantic Search #####
        # Encode the query with the bi-encoder and find potentially relevant passages.
        question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
        # question_embedding = question_embedding.cuda()
        hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
        hits = hits[0]  # Get the hits for the first query

        ##### Re-Ranking #####
        # Score all retrieved passages with the cross-encoder.
        cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
        cross_scores = cross_encoder.predict(cross_inp)

        # Attach the cross-encoder scores to the hits.
        for idx in range(len(cross_scores)):
            hits[idx]['cross-score'] = cross_scores[idx]

        # Output of the top-5 hits from the bi-encoder.
        st.write("\n-------------------------\n")
        st.title("Top-5 Bi-Encoder Retrieval hits")
        hits = sorted(hits, key=lambda x: x['score'], reverse=True)
        for hit in hits[0:5]:
            row_dict = df.loc[df['all_review'] == corpus[hit['corpus_id']]]
            st.subheader(row_dict['Hotel'].values[0])
            de = df_basic.loc[df_basic.Hotel == row_dict['Hotel'].values[0]]
            st.write(f'\tPrice Per night: {de.price_per_night.values[0]}')
            st.write(f'Description: {de.description.values[0]}')
            st.expander(de.description.values[0])

        # Output of the top-5 hits from the re-ranker.
        st.write("\n-------------------------\n")
        st.title("Top-5 Cross-Encoder Re-ranker hits")
        hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
        for hit in hits[0:5]:
            row_dict = df.loc[df['all_review'] == corpus[hit['corpus_id']]]
            st.subheader(row_dict['Hotel'].values[0])
            de = df_basic.loc[df_basic.Hotel == row_dict['Hotel'].values[0]]
            st.write(f'\tPrice Per night: {de.price_per_night.values[0]}')
            st.write(f'Description: {de.description.values[0]}')
            st.expander(de.description.values[0])

    sampletext = 'e.g. Hotel near Eiffel Tower with big rooms'
    userinput = st.text_input('Tell us what you are looking for in your hotel', sampletext, autocomplete="on")
    da = st.date_input("Date Check-in", datetime.date(2023, 6, 3))
    dst = st.date_input("Date Check-out", datetime.date(2023, 6, 8))

    if not userinput or userinput == sampletext:
        st.write("Please enter a query to get results")
    else:
        query = [str(userinput)]
        doc = nlp(str(userinput))
        search(str(userinput))

    # We use cosine-similarity and torch.topk to find the highest 5 scores


if __name__ == '__main__':
    main()
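The closing comment in paris-newer.py mentions cosine similarity with torch.topk, but the script itself relies on util.semantic_search plus the cross-encoder instead. For reference, a minimal sketch of that cosine/topk pattern using the same sentence-transformers utilities; the arguments are assumed to be the objects defined in the script (bi_encoder, the embeddings loaded from embeddings.npy, and the review corpus):

import torch
from sentence_transformers import util

def cosine_top5(query, bi_encoder, corpus_embeddings, corpus):
    # Sketch, not from the commit: embed the query, score it against every
    # corpus embedding by cosine similarity, and keep the five best matches.
    query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_embedding,
                              torch.as_tensor(corpus_embeddings, dtype=torch.float32))[0]
    top = torch.topk(cos_scores, k=5)
    return [(float(s), corpus[int(i)]) for s, i in zip(top.values, top.indices)]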
paris_clean_newer.csv
ADDED
The diff for this file is too large to render.
tokenized_corpus.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e99b20be01f7889248d5b3f667df8947ae6ca676f3a525717305e5124c8b739e
size 1261235
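The commit contains only the precomputed artifacts, not the code that produced them. Judging from how paris-newer.py loads embeddings.npy, corpus_embeddings_bi_encoder.pickle, and tokenized_corpus.pickle, they were presumably generated offline along these lines (a sketch under that assumption, reusing the model names and the corpus/bm25_tokenizer definitions from the script):

import pickle
import numpy as np
from sentence_transformers import SentenceTransformer

def precompute_artifacts(corpus, bm25_tokenizer):
    # Assumed offline step (not in this commit): write the three files uploaded here.
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    np.save('embeddings.npy', embedder.encode(corpus, show_progress_bar=True))
    with open('corpus_embeddings_bi_encoder.pickle', 'wb') as f:
        pickle.dump(bi_encoder.encode(corpus, convert_to_tensor=True), f)
    with open('tokenized_corpus.pickle', 'wb') as f:
        pickle.dump([bm25_tokenizer(p) for p in corpus], f)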