jinwei12 committed on
Commit c432f1f
1 Parent(s): a82f821

Update app.py

Files changed (1)
  app.py +471 -60
app.py CHANGED
@@ -1,64 +1,162 @@
- import streamlit as st
- import plotly.graph_objects as go
  import torch
- from transformers import AutoModelForTokenClassification, AutoTokenizer
  import requests


- def search_geonames(location):
-     api_endpoint = "http://api.geonames.org/searchJSON"
-     username = "zekun"
-
-     params = {
-         'q': location,
-         'username': username,
-         'maxRows': 5
-     }
-
-     response = requests.get(api_endpoint, params=params)
-     data = response.json()
-
-     if 'geonames' in data:
-         fig = go.Figure()
-         for place_info in data['geonames']:
-             latitude = float(place_info.get('lat', 0.0))
-             longitude = float(place_info.get('lng', 0.0))
-
-             fig.add_trace(go.Scattermapbox(
-                 lat=[latitude],
-                 lon=[longitude],
-                 mode='markers',
-                 marker=go.scattermapbox.Marker(
-                     size=10,
-                     color='orange',
-                 ),
-                 text=[f'Location: {location}'],
-                 hoverinfo="text",
-                 hovertemplate='<b>Location</b>: %{text}',
-             ))
-
-         fig.update_layout(
-             mapbox_style="open-street-map",
-             hovermode='closest',
-             mapbox=dict(
-                 bearing=0,
-                 center=go.layout.mapbox.Center(
-                     lat=latitude,
-                     lon=longitude
-                 ),
-                 pitch=0,
-                 zoom=2
-             ))
-
-         st.plotly_chart(fig)
-
-     # Return an empty figure
-     return go.Figure()


- def mapping(location):
-     st.write(f"Mapping location: {location}")
-     search_geonames(location)
 
@@ -81,19 +179,23 @@ def generate_human_readable(tokens,labels):
      return ret


- def showOnMap(input_sentence):
-     # get the location names:
      model_name = "zekun-li/geolm-base-toponym-recognition"

      tokenizer = AutoTokenizer.from_pretrained(model_name)
      model = AutoModelForTokenClassification.from_pretrained(model_name)

      tokens = tokenizer.encode(input_sentence, return_tensors="pt")

      outputs = model(tokens)

      predicted_labels = torch.argmax(outputs.logits, dim=2)

      predicted_labels = predicted_labels.detach().cpu().numpy()
@@ -108,27 +210,336 @@ def showOnMap(input_sentence):
      query_labels = predicted_labels[0][torch.where(predicted_labels[0] != 0)[0]]

      human_readable = generate_human_readable(tokenizer.convert_ids_to_tokens(query_tokens), query_labels)
-     # ['Los Angeles', 'L . A .', 'California', 'U . S .', 'Southern California', 'Los Angeles', 'United States', 'New York City']

-     return human_readable


  def show_on_map():

      input = st.text_area("Enter a sentence:", height=200)

      st.button("Submit")

-     places = showOnMap(input)

-     selected_place = st.selectbox("Select a location:", places)
-     mapping(selected_place)


  if __name__ == "__main__":

      show_on_map()
  import torch
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
+ from transformers import GeoLMModel
  import requests
+ import numpy as np
+ import pandas as pd
+ import scipy.spatial as sp
+ import streamlit as st
+ import folium
+ from streamlit.components.v1 import html

+ from haversine import haversine, Unit

+ dataset = None  # global gazetteer table; populated from geohash.csv in __main__
+
+
+ def generate_human_readable(tokens, labels):
+     ret = []
+     for t, lab in zip(tokens, labels):
+         if t == '[SEP]':
+             continue
+
+         if t.startswith("##"):
+             assert len(ret) > 0
+             ret[-1] = ret[-1] + t.strip('##')
+
+         elif lab == 2:
+             assert len(ret) > 0
+             ret[-1] = ret[-1] + " " + t.strip('##')
+         else:
+             ret.append(t)
+
+     return ret
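Reviewer note: a quick sketch, not part of the commit, of what generate_human_readable does with WordPiece output: '##' pieces are glued back onto the previous entry, and label 2 (I-Topo) continues a multi-word toponym with a space.

    # Editor's sketch, assuming the function above is in scope.
    tokens = ['Los', 'Angeles', 'Cali', '##for', '##nia']
    labels = [1, 2, 1, 2, 2]
    # generate_human_readable(tokens, labels) -> ['Los Angeles', 'California']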
+
+ def getSlice(tensor):
+     result = []
+     curr = []
+     for index, value in enumerate(tensor[0]):
+         if value == 1 or value == 2:
+             curr.append(index)
+
+         if value == 0 and curr != []:
+             result.append(curr)
+             curr = []
+
+     return result
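Reviewer note: getSlice collects runs of consecutive non-zero (B-Topo/I-Topo) positions into index spans. A toy check, assuming the function above:

    import torch
    labels = torch.tensor([[0, 1, 2, 0, 0, 1, 0]])
    # positions 1-2 form one toponym span, position 5 another:
    # getSlice(labels) -> [[1, 2], [5]]
    # caveat: a span that reaches the final token is never flushed, since
    # spans are only appended when a following 0 is seen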
+
+ def getIndex(input):
+
+     # Model name from Hugging Face model hub
+     model_name = "zekun-li/geolm-base-toponym-recognition"
+
+     # Load tokenizer and model
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForTokenClassification.from_pretrained(model_name)
+
+     # Tokenize input sentence
+     tokens = tokenizer.encode(input, return_tensors="pt")
+
+     # Pass tokens through the model
+     outputs = model(tokens)
+
+     # Retrieve predicted labels for each token
+     predicted_labels = torch.argmax(outputs.logits, dim=2)
+
+     predicted_labels = predicted_labels.detach().cpu().numpy()
+
+     # "id2label": { "0": "O", "1": "B-Topo", "2": "I-Topo" }
+     predicted_labels = [model.config.id2label[label] for label in predicted_labels[0]]
+     # print(predicted_labels)
+
+     # argmax is recomputed so predicted_labels is a tensor again after the
+     # id2label string conversion above
+     predicted_labels = torch.argmax(outputs.logits, dim=2)
+
+     # print(predicted_labels)
+
+     query_tokens = tokens[0][torch.where(predicted_labels[0] != 0)[0]]
+
+     query_labels = predicted_labels[0][torch.where(predicted_labels[0] != 0)[0]]
+
+     print(predicted_labels)
+     print(predicted_labels.shape)
+
+     slices = getSlice(predicted_labels)
+
+     # print(tokenizer.convert_ids_to_tokens(query_tokens))
+
+     return slices
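Reviewer note: the argmax over the logits picks one of the three tags per token ({0: 'O', 1: 'B-Topo', 2: 'I-Topo'}, per the id2label comment above). A minimal illustration with hand-made logits, not real model output:

    import torch
    logits = torch.tensor([[[4.0, 0.1, 0.2],    # -> 0 (O)
                            [0.0, 5.0, 0.1],    # -> 1 (B-Topo)
                            [0.1, 0.2, 3.0]]])  # -> 2 (I-Topo)
    print(torch.argmax(logits, dim=2))  # tensor([[0, 1, 2]])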
+
+ def cutSlices(tensor, slicesList):
+
+     locationTensor = torch.zeros(1, len(slicesList), 768)
+
+     curr = 0
+     for slice in slicesList:
+
+         if len(slice) == 1:
+             locationTensor[0][curr] = tensor[0][slice[0]]
+             curr = curr + 1
+         if len(slice) > 1:
+             sliceTensor = tensor[0][slice[0]:slice[-1]+1]
+             # (len, 768) -> (1, len, 768)
+             sliceTensor = sliceTensor.unsqueeze(0)
+
+             mean = torch.mean(sliceTensor, dim=1, keepdim=True)
+
+             locationTensor[0][curr] = mean[0]
+
+             curr = curr + 1
+
+     return locationTensor
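Reviewer note: cutSlices mean-pools each multi-token span of hidden states into one vector, so every toponym ends up as a single 768-dim embedding (768 is hard-coded to match the GeoLM hidden size). A small self-contained check, assuming the function above:

    import torch
    hidden = torch.randn(1, 4, 768)          # (batch, seq_len, hidden_dim)
    out = cutSlices(hidden, [[1], [2, 3]])   # -> shape (1, 2, 768)
    assert torch.allclose(out[0][0], hidden[0][1])
    assert torch.allclose(out[0][1], hidden[0][2:4].mean(dim=0))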
+
+
+ def MLearningFormInput(input):
+
+     model_name = "zekun-li/geolm-base-cased"
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+     model = GeoLMModel.from_pretrained(model_name)
+
+     tokens = tokenizer.encode(input, return_tensors="pt")
+
+     # ['[CLS]', 'Minneapolis', '[SEP]', 'Saint', 'Paul', '[SEP]', 'Du', '##lut', '##h', '[SEP]']
+     # print(tokens)
+
+     outputs = model(tokens, spatial_position_list_x=torch.zeros(tokens.shape), spatial_position_list_y=torch.zeros(tokens.shape))
+
+     # print(outputs.last_hidden_state)
+     # print(outputs.last_hidden_state.shape)
+
+     slicesIndex = getIndex(input)
+
+     # print(slicesIndex)
+
+     # tensor -> tensor
+     res = cutSlices(outputs.last_hidden_state, slicesIndex)
+
+     return res
      return ret


+ def getLocationName(input_sentence):
+     # Model name from Hugging Face model hub
      model_name = "zekun-li/geolm-base-toponym-recognition"

+     # Load tokenizer and model
      tokenizer = AutoTokenizer.from_pretrained(model_name)
      model = AutoModelForTokenClassification.from_pretrained(model_name)

+     # Tokenize input sentence
      tokens = tokenizer.encode(input_sentence, return_tensors="pt")

+     # Pass tokens through the model
      outputs = model(tokens)

+     # Retrieve predicted labels for each token
      predicted_labels = torch.argmax(outputs.logits, dim=2)

      predicted_labels = predicted_labels.detach().cpu().numpy()

      query_labels = predicted_labels[0][torch.where(predicted_labels[0] != 0)[0]]

      human_readable = generate_human_readable(tokenizer.convert_ids_to_tokens(query_tokens), query_labels)

+     return human_readable
+
+
+ def search_geonames(toponym, df):
+     # GeoNames API endpoint
+     api_endpoint = "http://api.geonames.org/searchJSON"
+
+     username = "zekun"
+
+     print(toponym)
+
+     params = {
+         'q': toponym,
+         'username': username,
+         'maxRows': 10
+     }
+
+     response = requests.get(api_endpoint, params=params)
+     data = response.json()
+
+     result = []
+
+     lat = []
+     lon = []
+
+     if 'geonames' in data:
+         for place_info in data['geonames']:
+             latitude = float(place_info.get('lat', 0.0))
+             longitude = float(place_info.get('lng', 0.0))
+
+             lat.append(latitude)
+             lon.append(longitude)
+
+             print(latitude)
+             print(longitude)
+
+             # getNeighborsDistance
+             id = place_info.get('geonameId', '')
+
+             print(id)
+
+             global dataset
+             res = get50Neigbors(id, dataset, k=50)
+             result.append(res)
+             # candidate_places.append({
+             #     'name': place_info.get('name', ''),
+             #     'country': place_info.get('countryName', ''),
+             #     'latitude': latitude,
+             #     'longitude': longitude,
+             # })
+             print(res)
+
+     df['lat'] = lat
+     df['lon'] = lon
+     result = torch.cat(result, dim=1).detach().numpy()
+     return result
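Reviewer note: for reference, a stripped-down version of the GeoNames lookup this function wraps (a sketch, not part of the commit; 'zekun' is the username hard-coded above, and free GeoNames accounts are rate-limited, so substitute your own):

    import requests

    resp = requests.get(
        "http://api.geonames.org/searchJSON",
        params={"q": "Beijing", "username": "zekun", "maxRows": 10},
        timeout=10,
    )
    for place in resp.json().get("geonames", []):
        print(place.get("name"), place.get("lat"), place.get("lng"), place.get("geonameId"))

Note too that search_geonames raises on torch.cat if GeoNames returns no match, since result stays empty.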
+
+
+ def get50Neigbors(locationID, dataset, k=50):
+
+     input_row = dataset.loc[dataset['GeonameID'] == locationID].iloc[0]
+
+     lat, lon, geohash, name = input_row['Latitude'], input_row['Longitude'], input_row['Geohash'], input_row['Name']
+
+     filtered_dataset = dataset.loc[dataset['Geohash'].str.startswith(geohash[:5])].copy()
+
+     filtered_dataset['distance'] = filtered_dataset.apply(
+         lambda row: haversine((lat, lon), (row['Latitude'], row['Longitude']), Unit.KILOMETERS),
+         axis=1
+     ).copy()
+
+     filtered_dataset = filtered_dataset.sort_values(by='distance')
+
+     nearest_neighbors = filtered_dataset.head(k)[['Name']]
+
+     neighbors = nearest_neighbors.values.tolist()
+
+     model_name = "zekun-li/geolm-base-toponym-recognition"
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+     sep_token_id = tokenizer.convert_tokens_to_ids(tokenizer.sep_token)
+     cls_token_id = tokenizer.convert_tokens_to_ids(tokenizer.cls_token)
+
+     neighbor_token_list = []
+     neighbor_token_list.append(cls_token_id)
+
+     target_token = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(name))
+
+     for neighbor in neighbors:
+         neighbor_token = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(neighbor[0]))
+         neighbor_token_list.extend(neighbor_token)
+         neighbor_token_list.append(sep_token_id)
+
+     # print(tokenizer.convert_ids_to_tokens(neighbor_token_list))
+
+     # --------------------------------------------
+     model = GeoLMModel.from_pretrained(model_name)
+
+     tokens = torch.Tensor(neighbor_token_list).unsqueeze(0).long()
+
+     # input "new neighbor sentence" -> model -> output
+     outputs = model(tokens, spatial_position_list_x=torch.zeros(tokens.shape), spatial_position_list_y=torch.zeros(tokens.shape))
+
+     # print(outputs.last_hidden_state)
+     # print(outputs.last_hidden_state.shape)
+
+     # the nearest neighbor is the query location itself (distance 0), so its
+     # tokens sit right after [CLS] at positions 1..len(target_token)
+     targetIndex = list(range(1, len(target_token) + 1))
+
+     # tensor -> tensor
+     # get (1, len(target_token), 768) -> (1, 1, 768)
+     res = cutSlices(outputs.last_hidden_state, [targetIndex])
+
+     return res
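Reviewer note: the pre-filtering relies on geohashes that share a prefix being spatially close, so matching the first five characters cheaply narrows the table before exact haversine distances are computed. A toy illustration with made-up rows (column names follow get50Neigbors; the coordinates and geohashes are illustrative only):

    import pandas as pd
    from haversine import haversine, Unit

    df = pd.DataFrame({
        "Name": ["A", "B", "C"],
        "Latitude": [39.90, 39.91, 48.85],
        "Longitude": [116.40, 116.41, 2.35],
        "Geohash": ["wx4g0b", "wx4g09", "u09tun"],
    })
    near = df[df["Geohash"].str.startswith("wx4g0b"[:5])].copy()
    near["distance"] = near.apply(
        lambda r: haversine((39.90, 116.40), (r["Latitude"], r["Longitude"]), Unit.KILOMETERS),
        axis=1,
    )
    print(near.sort_values("distance")[["Name", "distance"]])  # A (0 km), then B (~1.4 km)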
+
+
+ def cosine_similarity(target_feature, candidate_feature):
+
+     target_feature = target_feature.squeeze()
+     candidate_feature = candidate_feature.squeeze()
+
+     dot_product = torch.dot(target_feature, candidate_feature)
+
+     target = torch.norm(target_feature)
+     candidate = torch.norm(candidate_feature)
+
+     similarity = dot_product / (target * candidate)
+
+     return similarity.item()
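Reviewer note: a minimal sanity check of the similarity above (sketch only):

    import torch

    a = torch.tensor([1.0, 0.0, 1.0])
    b = torch.tensor([1.0, 1.0, 0.0])
    sim = torch.dot(a, b) / (torch.norm(a) * torch.norm(b))
    print(sim.item())  # 0.5; identical vectors give 1.0, orthogonal vectors 0.0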
+
+
+ @st.cache_data
+ def getCSV():
+     dataset = pd.read_csv('geohash.csv')
+     return dataset
+
+
+ def showing(df):
+
+     m = folium.Map(location=[df['lat'].mean(), df['lon'].mean()], zoom_start=5)
+
+     size_scale = 100
+     color_scale = 255
+
+     for i in range(len(df)):
+         lat, lon, prob = df.iloc[i]['lat'], df.iloc[i]['lon'], df.iloc[i]['prob']
+
+         size = int(prob**2 * size_scale)
+         color = int(prob**2 * color_scale)
+
+         folium.CircleMarker(
+             location=[lat, lon],
+             radius=size,
+             color=f'#{color:02X}0000',
+             fill=True,
+             fill_color=f'#{color:02X}0000'
+         ).add_to(m)
+
+     m.save("map.html")
+
+     with open("map.html", "r", encoding="utf-8") as f:
+         map_html = f.read()
+
+     st.components.v1.html(map_html, height=600)
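Reviewer note: a self-contained sketch of the folium rendering above with toy coordinates and probabilities (not part of the commit). Marker radius and red intensity both scale with prob**2, mirroring size_scale and color_scale:

    import folium

    m = folium.Map(location=[39.9, 116.4], zoom_start=5)
    for lat, lon, prob in [(39.90, 116.40, 0.9), (39.95, 116.30, 0.3)]:
        shade = int(prob ** 2 * 255)
        folium.CircleMarker(
            location=[lat, lon],
            radius=int(prob ** 2 * 100),
            color=f"#{shade:02X}0000",
            fill=True,
            fill_color=f"#{shade:02X}0000",
        ).add_to(m)
    m.save("map.html")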
+
+
+ def mapping(selected_place, locations, sentence_info):
+     location_index = locations.index(selected_place)
+     print(location_index)
+
+     df = pd.DataFrame()
+
+     # fetch every GeoNames match that shares the selected name (e.g. each "Beijing")
+     same_name_embedding = search_geonames(selected_place, df)
+
+     sim_matrix = []
+     print(sim_matrix)
+
+     same_name_embedding = torch.tensor(same_name_embedding)
+     # loop over each same-name candidate
+     for i in range(same_name_embedding.size(1)):
+         print((sentence_info[:, location_index, :]).shape)
+         print((same_name_embedding[:, i, :]).shape)
+
+         similarities = cosine_similarity(sentence_info[:, location_index, :], same_name_embedding[:, i, :])
+         sim_matrix.append(similarities)
+
+     # print("Cosine Similarity Matrix:")
+     # print(sim_matrix)
+
+     def sigmoid(x):
+         return 1 / (1 + np.exp(-x))
+
+     prob_matrix = sigmoid(np.array(sim_matrix))
+
+     df['prob'] = prob_matrix
+
+     print(df)
+
+     showing(df)
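Reviewer note: the inline sigmoid squashes cosine similarities from [-1, 1] into (0, 1) so they can serve as the marker "probabilities" consumed by showing(). For example:

    import numpy as np

    sims = np.array([-1.0, 0.0, 1.0])
    print(1 / (1 + np.exp(-sims)))  # approx. [0.269, 0.5, 0.731]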


  def show_on_map():

      input = st.text_area("Enter a sentence:", height=200)

      st.button("Submit")

+     # 1. input: a sentence -> output: tensor (1, num_locations, 768)
+     sentence_info = MLearningFormInput(input)
+
+     print("sentence info: ")
+     print(sentence_info)
+     print(sentence_info.shape)
+
+     # input: a sentence -> output: locations
+     locations = getLocationName(input)
+
+     selected_place = st.selectbox("Select a location:", locations)
+
+     if selected_place is not None:
+         mapping(selected_place, locations, sentence_info)


  if __name__ == "__main__":
+
+     dataset = getCSV()
+
      show_on_map()
+
+
+ # # The block below is commented-out manual test code; it can be hidden.
+
+ # # len: 80
+ # input= 'Minneapolis, officially the City of Minneapolis, is a city in the state of Minnesota and the county seat of Hennepin County. making it the largest city in Minnesota and the 46th-most-populous in the United States. Nicknamed the "City of Lakes", Minneapolis is abundant in water, with thirteen lakes, wetlands, the Mississippi River, creeks, and waterfalls.'
+
+ # 1. input: a sentence -> output: tensor (1, num_locations, 768)
+ # sentence_info= MLearningFormInput(input)
+
+ # print("sentence info: ")
+ # print(sentence_info)
+ # print(sentence_info.shape)
+
+ # # input: a sentence -> output: locations
+ # locations=getLocationName(input)
+
+ # print(locations)
+
+ # j=0
+
+ # k=0
+
+ # for location in locations:
+
+ #     if k==0:
+
+ #         # input: locations -> output: search in geonames (top 10 items) -> loop each item -> num_location x 10 x (1,1,768)
+ #         same_name_embedding=search_geonames(location)
+
+ #         sim_matrix=[]
+ #         print(sim_matrix)
+
+ #         same_name_embedding=torch.tensor(same_name_embedding)
+ #         # loop each "Beijing"
+ #         for i in range(same_name_embedding.size(1)):
+ #             # print((sentence_info[:, j, :]).shape)
+ #             # print((same_name_embedding[:, i, :]).shape)
+
+ #             similarities = cosine_similarity(sentence_info[:, j, :], same_name_embedding[:, i, :])
+ #             sim_matrix.append(similarities)
+
+ #         j=j+1
+
+ #         print("Cosine Similarity Matrix:")
+ #         print(sim_matrix)
+
+ #         k=1
+
+ #     else:
+ #         break
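Reviewer note on running this revision: getCSV expects a geohash.csv next to app.py with at least the GeonameID, Name, Latitude, Longitude and Geohash columns used by get50Neigbors, and `from transformers import GeoLMModel` is not in the stock transformers release, so it presumably needs the GeoLM authors' build of the library. Note too that the return value of st.button("Submit") is unused; Streamlit re-runs the script on every widget change, so the models run even without a click. With those caveats, the usual Streamlit entry point would apply:

    pip install streamlit folium haversine pandas numpy torch transformers
    streamlit run app.py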