faizhalas committed on
Commit
a1ba240
•
1 Parent(s): 0acf386

Create 6 Keywords Stem.py

Files changed (1)
  1. pages/6 Keywords Stem.py +231 -0
pages/6 Keywords Stem.py ADDED
@@ -0,0 +1,231 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ import re
+ import nltk
+ nltk.download('wordnet')
+ from nltk.stem import WordNetLemmatizer
+ nltk.download('stopwords')
+ from nltk.corpus import stopwords
+ from pprint import pprint
+ import pickle
+ import streamlit.components.v1 as components
+ from io import StringIO
+ from nltk.stem.snowball import SnowballStemmer
+ import csv
+ import sys
+
+ #===config===
+ st.set_page_config(
+     page_title="Coconut",
+     page_icon="🥥",
+     layout="wide",
+     initial_sidebar_state="collapsed"
+ )
+
+ hide_streamlit_style = """
+ <style>
+ #MainMenu {visibility: hidden;}
+ footer {visibility: hidden;}
+ [data-testid="collapsedControl"] {display: none}
+ </style>
+ """
+ st.markdown(hide_streamlit_style, unsafe_allow_html=True)
+
+ with st.popover("🔗 Menu"):
+     st.page_link("Home.py", label="Home", icon="🏠")
+     st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
+     st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
+     st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
+     st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
+     st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
+     st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
+
+
+ st.header("Keywords Stem", anchor=False)
+ st.subheader('Put your file here...', anchor=False)
+
+ def reset_data():
+     st.cache_data.clear()
+
+ #===check filetype===
+ @st.cache_data(ttl=3600)
+ def get_ext(extype):
+     extype = uploaded_file.name
+     return extype
+
+ #===upload===
+ @st.cache_data(ttl=3600)
+ def upload(extype):
+     keywords = pd.read_csv(uploaded_file)
+     return keywords
+
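+ # Web of Science-style .txt exports are tab-delimited with '\r' line endings
+ # and two-letter field tags; rename the tags to Scopus-style column names so
+ # the rest of the page can treat both file types the same way.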
+ @st.cache_data(ttl=3600)
+ def conv_txt(extype):
+     col_dict = {'TI': 'Title',
+                 'SO': 'Source title',
+                 'DE': 'Author Keywords',
+                 'ID': 'Keywords Plus'}
+     keywords = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
+     keywords.rename(columns=col_dict, inplace=True)
+     return keywords
+
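+ # Invert the mapping so a processed .txt file goes back out with its original field tags.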
+ @st.cache_data(ttl=3600)
+ def rev_conv_txt(extype):
+     col_dict_rev = {'Title': 'TI',
+                     'Source title': 'SO',
+                     'Author Keywords': 'DE',
+                     'Keywords Plus': 'ID'}
+     keywords.rename(columns=col_dict_rev, inplace=True)
+     return keywords
+
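+ # Collect every column whose name contains 'Keyword' (e.g. Author Keywords, Keywords Plus).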
+ @st.cache_data(ttl=3600)
+ def get_data(extype):
+     list_of_column_key = list(keywords.columns)
+     list_of_column_key = [k for k in list_of_column_key if 'Keyword' in k]
+     return list_of_column_key
+
+ uploaded_file = st.file_uploader('', type=['csv','txt'], on_change=reset_data)
+
+ if uploaded_file is not None:
+     extype = get_ext(uploaded_file)
+     if extype.endswith('.csv'):
+         keywords = upload(extype)
+
+     elif extype.endswith('.txt'):
+         keywords = conv_txt(extype)
+
+     list_of_column_key = get_data(extype)
+
+     col1, col2 = st.columns(2)
+     with col1:
+         method = st.selectbox(
+             'Choose method',
+             ('Lemmatization', 'Stemming'), on_change=reset_data)
+     with col2:
+         keyword = st.selectbox(
+             'Choose column',
+             (list_of_column_key), on_change=reset_data)
+
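+     # Normalise the chosen column: hyphens become spaces and every '; '
+     # separator is padded to ' ; ' so each keyword phrase splits into plain
+     # whitespace tokens that the stemmer/lemmatizer can walk word by word.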
+     @st.cache_data(ttl=3600)
+     def clean_keyword(extype):
+         global keyword, keywords
+         try:
+             key = keywords[keyword]
+         except KeyError:
+             st.error('Error: Please check your Author/Index Keywords column.')
+             sys.exit(1)
+         keywords = keywords.replace(np.nan, '', regex=True)
+         keywords[keyword] = keywords[keyword].astype(str)
+         keywords[keyword] = keywords[keyword].map(lambda x: re.sub('-', ' ', x))
+         keywords[keyword] = keywords[keyword].map(lambda x: re.sub('; ', ' ; ', x))
+         keywords[keyword] = keywords[keyword].map(lambda x: x.lower())
+
+         #===Keywords list===
+         key = key.dropna()
+         key = pd.concat([key.str.split('; ', expand=True)], axis=1)
+         key = pd.Series(np.ravel(key)).dropna().drop_duplicates().sort_values().reset_index()
+         key[0] = key[0].map(lambda x: re.sub('-', ' ', x))
+         key['new'] = key[0].map(lambda x: x.lower())
+
+         return keywords, key
+
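+     # Both reducers rewrite the keyword column in place and then collapse the
+     # padded ' ; ' back to '; ' so the output keeps the original separator.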
+     #===stem/lem===
+     @st.cache_data(ttl=3600)
+     def Lemmatization(extype):
+         lemmatizer = WordNetLemmatizer()
+         def lemmatize_words(text):
+             words = text.split()
+             words = [lemmatizer.lemmatize(word) for word in words]
+             return ' '.join(words)
+         keywords[keyword] = keywords[keyword].apply(lemmatize_words)
+         key['new'] = key['new'].apply(lemmatize_words)
+         keywords[keyword] = keywords[keyword].map(lambda x: re.sub(' ; ', '; ', x))
+         return keywords, key
+
+     @st.cache_data(ttl=3600)
+     def Stemming(extype):
+         stemmer = SnowballStemmer("english")
+         def stem_words(text):
+             words = text.split()
+             words = [stemmer.stem(word) for word in words]
+             return ' '.join(words)
+         keywords[keyword] = keywords[keyword].apply(stem_words)
+         key['new'] = key['new'].apply(stem_words)
+         keywords[keyword] = keywords[keyword].map(lambda x: re.sub(' ; ', '; ', x))
+         return keywords, key
+
+     keywords, key = clean_keyword(extype)
+
+     if method == 'Lemmatization':
+         keywords, key = Lemmatization(extype)
+     else:
+         keywords, key = Stemming(extype)
+
+     st.write('Congratulations! 🤩 You chose', keyword, 'with the', method, 'method. Now you can easily download the result by clicking the button below.')
+     st.divider()
+
+     #===show & download csv===
+     tab1, tab2, tab3, tab4 = st.tabs(["📥 Result", "📥 List of Keywords", "📃 Reference", "📃 Recommended Reading"])
+
+     with tab1:
+         st.dataframe(keywords, use_container_width=True, hide_index=True)
+         @st.cache_data(ttl=3600)
+         def convert_df(extype):
+             return keywords.to_csv(index=False).encode('utf-8')
+
+         @st.cache_data(ttl=3600)
+         def convert_txt(extype):
+             return keywords.to_csv(index=False, sep='\t', lineterminator='\r').encode('utf-8')
+
+         if extype.endswith('.csv'):
+             csv = convert_df(extype)
+             st.download_button(
+                 "Press to download result 👈",
+                 csv,
+                 "scopus.csv",
+                 "text/csv")
+
+         elif extype.endswith('.txt'):
+             keywords = rev_conv_txt(extype)
+             txt = convert_txt(extype)
+             st.download_button(
+                 "Press to download result 👈",
+                 txt,
+                 "savedrecs.txt",
+                 "text/csv")
+
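+     # Tab 2 lists each unique raw keyword next to its stemmed/lemmatized form.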
+     with tab2:
+         @st.cache_data(ttl=3600)
+         def table_keyword(extype):
+             keytab = key.drop(['index'], axis=1).rename(columns={0: 'label'})
+             return keytab
+         #===coloring the same keywords===
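+         # A duplicated value in 'new' means two different raw keywords were
+         # merged by the reducer; paint those cells yellow.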
+         @st.cache_data(ttl=3600)
+         def highlight_cells(value):
+             if keytab['new'].duplicated(keep=False).any() and keytab['new'].duplicated(keep=False)[keytab['new'] == value].any():
+                 return 'background-color: yellow'
+             return ''
+         keytab = table_keyword(extype)
+         st.dataframe(keytab.style.applymap(highlight_cells, subset=['new']), use_container_width=True, hide_index=True)
+
+         @st.cache_data(ttl=3600)
+         def convert_dfs(extype):
+             return key.to_csv(index=False).encode('utf-8')
+
+         csv = convert_dfs(extype)
+
+         st.download_button(
+             "Press to download keywords 👈",
+             csv,
+             "keywords.csv",
+             "text/csv")
+
+     with tab3:
+         st.markdown('**Santosa, F. A. (2023). Prior steps into knowledge mapping: Text mining application and comparison. Issues in Science and Technology Librarianship, 102.** https://doi.org/10.29173/istl2736')
+
+     with tab4:
+         st.markdown('**Beri, A. (2021, January 27). Stemming vs Lemmatization. Medium.** https://towardsdatascience.com/stemming-vs-lemmatization-2daddabcb221')
+         st.markdown('**Khyani, D., Siddhartha B S, Niveditha N M, & Divya B M. (2020). An Interpretation of Lemmatization and Stemming in Natural Language Processing. Journal of University of Shanghai for Science and Technology, 22(10), 350–357.** https://jusst.org/an-interpretation-of-lemmatization-and-stemming-in-natural-language-processing/')
+         st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Text Pre-Processing. Text Mining for Information Professionals, 79–103.** https://doi.org/10.1007/978-3-030-85085-2_3')