jinwei12 committed on
Commit c432f1f
1 Parent(s): a82f821

Update app.py

Files changed (1)
  app.py +471 -60
app.py CHANGED
@@ -1,64 +1,162 @@
- import streamlit as st
- import plotly.graph_objects as go
  import torch
- from transformers import AutoModelForTokenClassification, AutoTokenizer
  import requests


- def search_geonames(location):
-     api_endpoint = "http://api.geonames.org/searchJSON"
-     username = "zekun"
-
-     params = {
-         'q': location,
-         'username': username,
-         'maxRows': 5
-     }
-
-     response = requests.get(api_endpoint, params=params)
-     data = response.json()
-
-     if 'geonames' in data:
-         fig = go.Figure()
-         for place_info in data['geonames']:
-             latitude = float(place_info.get('lat', 0.0))
-             longitude = float(place_info.get('lng', 0.0))
-
-             fig.add_trace(go.Scattermapbox(
-                 lat=[latitude],
-                 lon=[longitude],
-                 mode='markers',
-                 marker=go.scattermapbox.Marker(
-                     size=10,
-                     color='orange',
-                 ),
-                 text=[f'Location: {location}'],
-                 hoverinfo="text",
-                 hovertemplate='<b>Location</b>: %{text}',
-             ))
-
-         fig.update_layout(
-             mapbox_style="open-street-map",
-             hovermode='closest',
-             mapbox=dict(
-                 bearing=0,
-                 center=go.layout.mapbox.Center(
-                     lat=latitude,
-                     lon=longitude
-                 ),
-                 pitch=0,
-                 zoom=2
-             ))
-
-         st.plotly_chart(fig)
-
-     # Return an empty figure
-     return go.Figure()


- def mapping(location):
-     st.write(f"Mapping location: {location}")
-     search_geonames(location)
 
@@ -81,19 +179,23 @@ def generate_human_readable(tokens,labels):
      return ret


- def showOnMap(input_sentence):
-     # get the location names:
      model_name = "zekun-li/geolm-base-toponym-recognition"

      tokenizer = AutoTokenizer.from_pretrained(model_name)
      model = AutoModelForTokenClassification.from_pretrained(model_name)

      tokens = tokenizer.encode(input_sentence, return_tensors="pt")

      outputs = model(tokens)

      predicted_labels = torch.argmax(outputs.logits, dim=2)

      predicted_labels = predicted_labels.detach().cpu().numpy()
@@ -108,27 +210,336 @@ def showOnMap(input_sentence):
      query_labels = predicted_labels[0][torch.where(predicted_labels[0] != 0)[0]]

      human_readable = generate_human_readable(tokenizer.convert_ids_to_tokens(query_tokens), query_labels)
-     # ['Los Angeles', 'L . A .', 'California', 'U . S .', 'Southern California', 'Los Angeles', 'United States', 'New York City']

-     return human_readable


  def show_on_map():

      input = st.text_area("Enter a sentence:", height=200)

      st.button("Submit")

-     places = showOnMap(input)

-     selected_place = st.selectbox("Select a location:", places)
-     mapping(selected_place)


  if __name__ == "__main__":

      show_on_map()
  import torch
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
+ from transformers import GeoLMModel
  import requests
+ import numpy as np
+ import pandas as pd
+ import scipy.spatial as sp
+ import streamlit as st
+ import folium
+ from streamlit.components.v1 import html

+ from haversine import haversine, Unit

+ dataset = None  # global gazetteer table; populated from geohash.csv in __main__
+
+
+ def generate_human_readable(tokens, labels):
+     ret = []
+     for t, lab in zip(tokens, labels):
+         if t == '[SEP]':
+             continue
+
+         if t.startswith("##"):
+             assert len(ret) > 0
+             ret[-1] = ret[-1] + t.strip('##')
+
+         elif lab == 2:
+             assert len(ret) > 0
+             ret[-1] = ret[-1] + " " + t.strip('##')
+         else:
+             ret.append(t)
+
+     return ret
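Reviewer note: a quick sketch, not part of the commit, of what generate_human_readable does with WordPiece output: '##' pieces are glued back onto the previous entry, and label 2 (I-Topo) continues a multi-word toponym with a space.

    # Editor's sketch, assuming the function above is in scope.
    tokens = ['Los', 'Angeles', 'Cali', '##for', '##nia']
    labels = [1, 2, 1, 2, 2]
    # generate_human_readable(tokens, labels) -> ['Los Angeles', 'California']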
+
+ def getSlice(tensor):
+     result = []
+     curr = []
+     for index, value in enumerate(tensor[0]):
+         if value == 1 or value == 2:
+             curr.append(index)
+
+         if value == 0 and curr != []:
+             result.append(curr)
+             curr = []
+
+     return result
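Reviewer note: getSlice collects runs of consecutive non-zero (B-Topo/I-Topo) positions into index spans. A toy check, assuming the function above:

    import torch
    labels = torch.tensor([[0, 1, 2, 0, 0, 1, 0]])
    # positions 1-2 form one toponym span, position 5 another:
    # getSlice(labels) -> [[1, 2], [5]]
    # caveat: a span that reaches the final token is never flushed, since
    # spans are only appended when a following 0 is seen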
+
+ def getIndex(input):
+
+     # Model name from Hugging Face model hub
+     model_name = "zekun-li/geolm-base-toponym-recognition"
+
+     # Load tokenizer and model
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForTokenClassification.from_pretrained(model_name)
+
+     # Tokenize input sentence
+     tokens = tokenizer.encode(input, return_tensors="pt")
+
+     # Pass tokens through the model
+     outputs = model(tokens)
+
+     # Retrieve predicted labels for each token
+     predicted_labels = torch.argmax(outputs.logits, dim=2)
+
+     predicted_labels = predicted_labels.detach().cpu().numpy()
+
+     # "id2label": { "0": "O", "1": "B-Topo", "2": "I-Topo" }
+     predicted_labels = [model.config.id2label[label] for label in predicted_labels[0]]
+     # print(predicted_labels)
+
+     # argmax is recomputed so predicted_labels is a tensor again after the
+     # id2label string conversion above
+     predicted_labels = torch.argmax(outputs.logits, dim=2)
+
+     # print(predicted_labels)
+
+     query_tokens = tokens[0][torch.where(predicted_labels[0] != 0)[0]]
+
+     query_labels = predicted_labels[0][torch.where(predicted_labels[0] != 0)[0]]
+
+     print(predicted_labels)
+     print(predicted_labels.shape)
+
+     slices = getSlice(predicted_labels)
+
+     # print(tokenizer.convert_ids_to_tokens(query_tokens))
+
+     return slices
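Reviewer note: the argmax over the logits picks one of the three tags per token ({0: 'O', 1: 'B-Topo', 2: 'I-Topo'}, per the id2label comment above). A minimal illustration with hand-made logits, not real model output:

    import torch
    logits = torch.tensor([[[4.0, 0.1, 0.2],    # -> 0 (O)
                            [0.0, 5.0, 0.1],    # -> 1 (B-Topo)
                            [0.1, 0.2, 3.0]]])  # -> 2 (I-Topo)
    print(torch.argmax(logits, dim=2))  # tensor([[0, 1, 2]])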
+
+ def cutSlices(tensor, slicesList):
+
+     locationTensor = torch.zeros(1, len(slicesList), 768)
+
+     curr = 0
+     for slice in slicesList:
+
+         if len(slice) == 1:
+             locationTensor[0][curr] = tensor[0][slice[0]]
+             curr = curr + 1
+         if len(slice) > 1:
+             sliceTensor = tensor[0][slice[0]:slice[-1]+1]
+             # (len, 768) -> (1, len, 768)
+             sliceTensor = sliceTensor.unsqueeze(0)
+
+             mean = torch.mean(sliceTensor, dim=1, keepdim=True)
+
+             locationTensor[0][curr] = mean[0]
+
+             curr = curr + 1
+
+     return locationTensor
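Reviewer note: cutSlices mean-pools each multi-token span of hidden states into one vector, so every toponym ends up as a single 768-dim embedding (768 is hard-coded to match the GeoLM hidden size). A small self-contained check, assuming the function above:

    import torch
    hidden = torch.randn(1, 4, 768)          # (batch, seq_len, hidden_dim)
    out = cutSlices(hidden, [[1], [2, 3]])   # -> shape (1, 2, 768)
    assert torch.allclose(out[0][0], hidden[0][1])
    assert torch.allclose(out[0][1], hidden[0][2:4].mean(dim=0))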
+
+
+ def MLearningFormInput(input):
+
+     model_name = "zekun-li/geolm-base-cased"
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+     model = GeoLMModel.from_pretrained(model_name)
+
+     tokens = tokenizer.encode(input, return_tensors="pt")
+
+     # ['[CLS]', 'Minneapolis', '[SEP]', 'Saint', 'Paul', '[SEP]', 'Du', '##lut', '##h', '[SEP]']
+     # print(tokens)
+
+     outputs = model(tokens, spatial_position_list_x=torch.zeros(tokens.shape), spatial_position_list_y=torch.zeros(tokens.shape))
+
+     # print(outputs.last_hidden_state)
+     # print(outputs.last_hidden_state.shape)
+
+     slicesIndex = getIndex(input)
+
+     # print(slicesIndex)
+
+     # tensor -> tensor
+     res = cutSlices(outputs.last_hidden_state, slicesIndex)
+
+     return res
      return ret


+ def getLocationName(input_sentence):
+     # Model name from Hugging Face model hub
      model_name = "zekun-li/geolm-base-toponym-recognition"

+     # Load tokenizer and model
      tokenizer = AutoTokenizer.from_pretrained(model_name)
      model = AutoModelForTokenClassification.from_pretrained(model_name)

+     # Tokenize input sentence
      tokens = tokenizer.encode(input_sentence, return_tensors="pt")

+     # Pass tokens through the model
      outputs = model(tokens)

+     # Retrieve predicted labels for each token
      predicted_labels = torch.argmax(outputs.logits, dim=2)

      predicted_labels = predicted_labels.detach().cpu().numpy()

      query_labels = predicted_labels[0][torch.where(predicted_labels[0] != 0)[0]]

      human_readable = generate_human_readable(tokenizer.convert_ids_to_tokens(query_tokens), query_labels)

+     return human_readable
+
+
+ def search_geonames(toponym, df):
+     # GeoNames API endpoint
+     api_endpoint = "http://api.geonames.org/searchJSON"
+
+     username = "zekun"
+
+     print(toponym)
+
+     params = {
+         'q': toponym,
+         'username': username,
+         'maxRows': 10
+     }
+
+     response = requests.get(api_endpoint, params=params)
+     data = response.json()
+
+     result = []
+
+     lat = []
+     lon = []
+
+     if 'geonames' in data:
+         for place_info in data['geonames']:
+             latitude = float(place_info.get('lat', 0.0))
+             longitude = float(place_info.get('lng', 0.0))
+
+             lat.append(latitude)
+             lon.append(longitude)
+
+             print(latitude)
+             print(longitude)
+
+             # getNeighborsDistance
+             id = place_info.get('geonameId', '')
+
+             print(id)
+
+             global dataset
+             res = get50Neigbors(id, dataset, k=50)
+             result.append(res)
+             # candidate_places.append({
+             #     'name': place_info.get('name', ''),
+             #     'country': place_info.get('countryName', ''),
+             #     'latitude': latitude,
+             #     'longitude': longitude,
+             # })
+             print(res)
+
+     df['lat'] = lat
+     df['lon'] = lon
+     result = torch.cat(result, dim=1).detach().numpy()
+     return result
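Reviewer note: for reference, a stripped-down version of the GeoNames lookup this function wraps (a sketch, not part of the commit; 'zekun' is the username hard-coded above, and free GeoNames accounts are rate-limited, so substitute your own):

    import requests

    resp = requests.get(
        "http://api.geonames.org/searchJSON",
        params={"q": "Beijing", "username": "zekun", "maxRows": 10},
        timeout=10,
    )
    for place in resp.json().get("geonames", []):
        print(place.get("name"), place.get("lat"), place.get("lng"), place.get("geonameId"))

Note too that search_geonames raises on torch.cat if GeoNames returns no match, since result stays empty.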
+
+
+ def get50Neigbors(locationID, dataset, k=50):
+
+     input_row = dataset.loc[dataset['GeonameID'] == locationID].iloc[0]
+
+     lat, lon, geohash, name = input_row['Latitude'], input_row['Longitude'], input_row['Geohash'], input_row['Name']
+
+     filtered_dataset = dataset.loc[dataset['Geohash'].str.startswith(geohash[:5])].copy()
+
+     filtered_dataset['distance'] = filtered_dataset.apply(
+         lambda row: haversine((lat, lon), (row['Latitude'], row['Longitude']), Unit.KILOMETERS),
+         axis=1
+     ).copy()
+
+     filtered_dataset = filtered_dataset.sort_values(by='distance')
+
+     nearest_neighbors = filtered_dataset.head(k)[['Name']]
+
+     neighbors = nearest_neighbors.values.tolist()
+
+     model_name = "zekun-li/geolm-base-toponym-recognition"
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+     sep_token_id = tokenizer.convert_tokens_to_ids(tokenizer.sep_token)
+     cls_token_id = tokenizer.convert_tokens_to_ids(tokenizer.cls_token)
+
+     neighbor_token_list = []
+     neighbor_token_list.append(cls_token_id)
+
+     target_token = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(name))
+
+     for neighbor in neighbors:
+         neighbor_token = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(neighbor[0]))
+         neighbor_token_list.extend(neighbor_token)
+         neighbor_token_list.append(sep_token_id)
+
+     # print(tokenizer.convert_ids_to_tokens(neighbor_token_list))
+
+     # --------------------------------------------
+     model = GeoLMModel.from_pretrained(model_name)
+
+     tokens = torch.Tensor(neighbor_token_list).unsqueeze(0).long()
+
+     # input "new neighbor sentence" -> model -> output
+     outputs = model(tokens, spatial_position_list_x=torch.zeros(tokens.shape), spatial_position_list_y=torch.zeros(tokens.shape))
+
+     # print(outputs.last_hidden_state)
+     # print(outputs.last_hidden_state.shape)
+
+     # the nearest neighbor is the query location itself (distance 0), so its
+     # tokens sit right after [CLS] at positions 1..len(target_token)
+     targetIndex = list(range(1, len(target_token) + 1))
+
+     # tensor -> tensor
+     # get (1, len(target_token), 768) -> (1, 1, 768)
+     res = cutSlices(outputs.last_hidden_state, [targetIndex])
+
+     return res
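Reviewer note: the pre-filtering relies on geohashes that share a prefix being spatially close, so matching the first five characters cheaply narrows the table before exact haversine distances are computed. A toy illustration with made-up rows (column names follow get50Neigbors; the coordinates and geohashes are illustrative only):

    import pandas as pd
    from haversine import haversine, Unit

    df = pd.DataFrame({
        "Name": ["A", "B", "C"],
        "Latitude": [39.90, 39.91, 48.85],
        "Longitude": [116.40, 116.41, 2.35],
        "Geohash": ["wx4g0b", "wx4g09", "u09tun"],
    })
    near = df[df["Geohash"].str.startswith("wx4g0b"[:5])].copy()
    near["distance"] = near.apply(
        lambda r: haversine((39.90, 116.40), (r["Latitude"], r["Longitude"]), Unit.KILOMETERS),
        axis=1,
    )
    print(near.sort_values("distance")[["Name", "distance"]])  # A (0 km), then B (~1.4 km)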
+
+
+ def cosine_similarity(target_feature, candidate_feature):
+
+     target_feature = target_feature.squeeze()
+     candidate_feature = candidate_feature.squeeze()
+
+     dot_product = torch.dot(target_feature, candidate_feature)
+
+     target = torch.norm(target_feature)
+     candidate = torch.norm(candidate_feature)
+
+     similarity = dot_product / (target * candidate)
+
+     return similarity.item()
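Reviewer note: a minimal sanity check of the similarity above (sketch only):

    import torch

    a = torch.tensor([1.0, 0.0, 1.0])
    b = torch.tensor([1.0, 1.0, 0.0])
    sim = torch.dot(a, b) / (torch.norm(a) * torch.norm(b))
    print(sim.item())  # 0.5; identical vectors give 1.0, orthogonal vectors 0.0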
+
+
+ @st.cache_data
+ def getCSV():
+     dataset = pd.read_csv('geohash.csv')
+     return dataset
+
+
+ def showing(df):
+
+     m = folium.Map(location=[df['lat'].mean(), df['lon'].mean()], zoom_start=5)
+
+     size_scale = 100
+     color_scale = 255
+
+     for i in range(len(df)):
+         lat, lon, prob = df.iloc[i]['lat'], df.iloc[i]['lon'], df.iloc[i]['prob']
+
+         size = int(prob**2 * size_scale)
+         color = int(prob**2 * color_scale)
+
+         folium.CircleMarker(
+             location=[lat, lon],
+             radius=size,
+             color=f'#{color:02X}0000',
+             fill=True,
+             fill_color=f'#{color:02X}0000'
+         ).add_to(m)
+
+     m.save("map.html")
+
+     with open("map.html", "r", encoding="utf-8") as f:
+         map_html = f.read()
+
+     st.components.v1.html(map_html, height=600)
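Reviewer note: a self-contained sketch of the folium rendering above with toy coordinates and probabilities (not part of the commit). Marker radius and red intensity both scale with prob**2, mirroring size_scale and color_scale:

    import folium

    m = folium.Map(location=[39.9, 116.4], zoom_start=5)
    for lat, lon, prob in [(39.90, 116.40, 0.9), (39.95, 116.30, 0.3)]:
        shade = int(prob ** 2 * 255)
        folium.CircleMarker(
            location=[lat, lon],
            radius=int(prob ** 2 * 100),
            color=f"#{shade:02X}0000",
            fill=True,
            fill_color=f"#{shade:02X}0000",
        ).add_to(m)
    m.save("map.html")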
+
+
+ def mapping(selected_place, locations, sentence_info):
+     location_index = locations.index(selected_place)
+     print(location_index)
+
+     df = pd.DataFrame()
+
+     # fetch every GeoNames match that shares the selected name (e.g. each "Beijing")
+     same_name_embedding = search_geonames(selected_place, df)
+
+     sim_matrix = []
+     print(sim_matrix)
+
+     same_name_embedding = torch.tensor(same_name_embedding)
+     # loop over each same-name candidate
+     for i in range(same_name_embedding.size(1)):
+         print((sentence_info[:, location_index, :]).shape)
+         print((same_name_embedding[:, i, :]).shape)
+
+         similarities = cosine_similarity(sentence_info[:, location_index, :], same_name_embedding[:, i, :])
+         sim_matrix.append(similarities)
+
+     # print("Cosine Similarity Matrix:")
+     # print(sim_matrix)
+
+     def sigmoid(x):
+         return 1 / (1 + np.exp(-x))
+
+     prob_matrix = sigmoid(np.array(sim_matrix))
+
+     df['prob'] = prob_matrix
+
+     print(df)
+
+     showing(df)
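Reviewer note: the inline sigmoid squashes cosine similarities from [-1, 1] into (0, 1) so they can serve as the marker "probabilities" consumed by showing(). For example:

    import numpy as np

    sims = np.array([-1.0, 0.0, 1.0])
    print(1 / (1 + np.exp(-sims)))  # approx. [0.269, 0.5, 0.731]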


  def show_on_map():

      input = st.text_area("Enter a sentence:", height=200)

      st.button("Submit")

+     # 1. input: a sentence -> output: tensor (1, num_locations, 768)
+     sentence_info = MLearningFormInput(input)
+
+     print("sentence info: ")
+     print(sentence_info)
+     print(sentence_info.shape)
+
+     # input: a sentence -> output: locations
+     locations = getLocationName(input)
+
+     selected_place = st.selectbox("Select a location:", locations)
+
+     if selected_place is not None:
+         mapping(selected_place, locations, sentence_info)


  if __name__ == "__main__":
+
+     dataset = getCSV()
+
      show_on_map()
+
+
+ # # The block below is commented-out manual test code; it can be hidden.
+
+ # # len: 80
+ # input= 'Minneapolis, officially the City of Minneapolis, is a city in the state of Minnesota and the county seat of Hennepin County. making it the largest city in Minnesota and the 46th-most-populous in the United States. Nicknamed the "City of Lakes", Minneapolis is abundant in water, with thirteen lakes, wetlands, the Mississippi River, creeks, and waterfalls.'
+
+ # 1. input: a sentence -> output: tensor (1, num_locations, 768)
+ # sentence_info= MLearningFormInput(input)
+
+ # print("sentence info: ")
+ # print(sentence_info)
+ # print(sentence_info.shape)
+
+ # # input: a sentence -> output: locations
+ # locations=getLocationName(input)
+
+ # print(locations)
+
+ # j=0
+
+ # k=0
+
+ # for location in locations:
+
+ #     if k==0:
+
+ #         # input: locations -> output: search in geonames (top 10 items) -> loop each item -> num_location x 10 x (1,1,768)
+ #         same_name_embedding=search_geonames(location)
+
+ #         sim_matrix=[]
+ #         print(sim_matrix)
+
+ #         same_name_embedding=torch.tensor(same_name_embedding)
+ #         # loop each "Beijing"
+ #         for i in range(same_name_embedding.size(1)):
+ #             # print((sentence_info[:, j, :]).shape)
+ #             # print((same_name_embedding[:, i, :]).shape)
+
+ #             similarities = cosine_similarity(sentence_info[:, j, :], same_name_embedding[:, i, :])
+ #             sim_matrix.append(similarities)
+
+ #         j=j+1
+
+ #         print("Cosine Similarity Matrix:")
+ #         print(sim_matrix)
+
+ #         k=1
+
+ #     else:
+ #         break
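Reviewer note on running this revision: getCSV expects a geohash.csv next to app.py with at least the GeonameID, Name, Latitude, Longitude and Geohash columns used by get50Neigbors, and `from transformers import GeoLMModel` is not in the stock transformers release, so it presumably needs the GeoLM authors' build of the library. Note too that the return value of st.button("Submit") is unused; Streamlit re-runs the script on every widget change, so the models run even without a click. With those caveats, the usual Streamlit entry point would apply:

    pip install streamlit folium haversine pandas numpy torch transformers
    streamlit run app.py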