faizhalas commited on
Commit
582dc17
β€’
1 Parent(s): 19b9cbc

Update pages/6 Keywords Stem.py

Browse files
Files changed (1) hide show
  1. pages/6 Keywords Stem.py +140 -133
pages/6 Keywords Stem.py CHANGED
@@ -89,143 +89,150 @@ def get_data(extype):
89
  uploaded_file = st.file_uploader('', type=['csv','txt'], on_change=reset_data)
90
 
91
  if uploaded_file is not None:
92
- extype = get_ext(uploaded_file)
93
- if extype.endswith('.csv'):
94
- keywords = upload(extype)
95
-
96
- elif extype.endswith('.txt'):
97
- keywords = conv_txt(extype)
98
-
99
- list_of_column_key = get_data(extype)
100
-
101
- col1, col2 = st.columns(2)
102
- with col1:
103
- method = st.selectbox(
104
- 'Choose method',
105
- ('Lemmatization', 'Stemming'), on_change=reset_data)
106
- with col2:
107
- keyword = st.selectbox(
108
- 'Choose column',
109
- (list_of_column_key), on_change=reset_data)
110
-
111
- @st.cache_data(ttl=3600)
112
- def clean_keyword(extype):
113
- global keyword, keywords
114
- try:
115
- key = keywords[keyword]
116
- except KeyError:
117
- st.error('Error: Please check your Author/Index Keywords column.')
118
- sys.exit(1)
119
- keywords = keywords.replace(np.nan, '', regex=True)
120
- keywords[keyword] = keywords[keyword].astype(str)
121
- keywords[keyword] = keywords[keyword].map(lambda x: re.sub('-', ' ', x))
122
- keywords[keyword] = keywords[keyword].map(lambda x: re.sub('; ', ' ; ', x))
123
- keywords[keyword] = keywords[keyword].map(lambda x: x.lower())
124
-
125
- #===Keywords list===
126
- key = key.dropna()
127
- key = pd.concat([key.str.split('; ', expand=True)], axis=1)
128
- key = pd.Series(np.ravel(key)).dropna().drop_duplicates().sort_values().reset_index()
129
- key[0] = key[0].map(lambda x: re.sub('-', ' ', x))
130
- key['new']=key[0].map(lambda x: x.lower())
131
-
132
- return keywords, key
133
-
134
- #===stem/lem===
135
- @st.cache_data(ttl=3600)
136
- def Lemmatization(extype):
137
- lemmatizer = WordNetLemmatizer()
138
- def lemmatize_words(text):
139
- words = text.split()
140
- words = [lemmatizer.lemmatize(word) for word in words]
141
- return ' '.join(words)
142
- keywords[keyword] = keywords[keyword].apply(lemmatize_words)
143
- key['new'] = key['new'].apply(lemmatize_words)
144
- keywords[keyword] = keywords[keyword].map(lambda x: re.sub(' ; ', '; ', x))
145
- return keywords, key
146
-
147
- @st.cache_data(ttl=3600)
148
- def Stemming(extype):
149
- stemmer = SnowballStemmer("english")
150
- def stem_words(text):
151
- words = text.split()
152
- words = [stemmer.stem(word) for word in words]
153
- return ' '.join(words)
154
- keywords[keyword] = keywords[keyword].apply(stem_words)
155
- key['new'] = key['new'].apply(stem_words)
156
- keywords[keyword] = keywords[keyword].map(lambda x: re.sub(' ; ', '; ', x))
157
- return keywords, key
158
-
159
- keywords, key = clean_keyword(extype)
160
-
161
- if method is 'Lemmatization':
162
- keywords, key = Lemmatization(extype)
163
- else:
164
- keywords, key = Stemming(extype)
165
 
166
- st.write('Congratulations! 🀩 You choose',keyword ,'with',method,'method. Now, you can easily download the result by clicking the button below')
167
- st.divider()
168
-
169
- #===show & download csv===
170
- tab1, tab2, tab3, tab4 = st.tabs(["πŸ“₯ Result", "πŸ“₯ List of Keywords", "πŸ“ƒ Reference", "πŸ“ƒ Recommended Reading"])
171
-
172
- with tab1:
173
- st.dataframe(keywords, use_container_width=True, hide_index=True)
174
- @st.cache_data(ttl=3600)
175
- def convert_df(extype):
176
- return keywords.to_csv(index=False).encode('utf-8')
177
 
178
- @st.cache_data(ttl=3600)
179
- def convert_txt(extype):
180
- return keywords.to_csv(index=False, sep='\t', lineterminator='\r').encode('utf-8')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
- if extype.endswith('.csv'):
183
- csv = convert_df(extype)
184
- st.download_button(
185
- "Press to download result πŸ‘ˆ",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  csv,
187
- "scopus.csv",
188
  "text/csv")
189
-
190
- elif extype.endswith('.txt'):
191
- keywords = rev_conv_txt(extype)
192
- txt = convert_txt(extype)
193
- st.download_button(
194
- "Press to download result πŸ‘ˆ",
195
- txt,
196
- "savedrecs.txt",
197
- "text/csv")
198
 
199
- with tab2:
200
- @st.cache_data(ttl=3600)
201
- def table_keyword(extype):
202
- keytab = key.drop(['index'], axis=1).rename(columns={0: 'label'})
203
- return keytab
204
- #===coloring the same keywords===
205
- @st.cache_data(ttl=3600)
206
- def highlight_cells(value):
207
- if keytab['new'].duplicated(keep=False).any() and keytab['new'].duplicated(keep=False)[keytab['new'] == value].any():
208
- return 'background-color: yellow'
209
- return ''
210
- keytab = table_keyword(extype)
211
- st.dataframe(keytab.style.applymap(highlight_cells, subset=['new']), use_container_width=True, hide_index=True)
212
-
213
- @st.cache_data(ttl=3600)
214
- def convert_dfs(extype):
215
- return key.to_csv(index=False).encode('utf-8')
216
-
217
- csv = convert_dfs(extype)
218
 
219
- st.download_button(
220
- "Press to download keywords πŸ‘ˆ",
221
- csv,
222
- "keywords.csv",
223
- "text/csv")
224
-
225
- with tab3:
226
- st.markdown('**Santosa, F. A. (2023). Prior steps into knowledge mapping: Text mining application and comparison. Issues in Science and Technology Librarianship, 102.** https://doi.org/10.29173/istl2736')
227
-
228
- with tab4:
229
- st.markdown('**Beri, A. (2021, January 27). Stemming vs Lemmatization. Medium.** https://towardsdatascience.com/stemming-vs-lemmatization-2daddabcb221')
230
- st.markdown('**Khyani, D., Siddhartha B S, Niveditha N M, & Divya B M. (2020). An Interpretation of Lemmatization and Stemming in Natural Language Processing. Journal of University of Shanghai for Science and Technology , 22(10), 350–357.** https://jusst.org/an-interpretation-of-lemmatization-and-stemming-in-natural-language-processing/')
231
- st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Text Pre-Processing. Text Mining for Information Professionals, 79–103.** https://doi.org/10.1007/978-3-030-85085-2_3')
 
89
#=== upload & process: stem/lemmatize a keywords column and offer downloads ===
uploaded_file = st.file_uploader('', type=['csv','txt'], on_change=reset_data)

if uploaded_file is not None:
    try:
        # Dispatch to the loader matching the uploaded file's extension.
        extype = get_ext(uploaded_file)
        if extype.endswith('.csv'):
            keywords = upload(extype)

        elif extype.endswith('.txt'):
            keywords = conv_txt(extype)

        list_of_column_key = get_data(extype)

        # --- user choices: normalisation method and target column ---
        col1, col2 = st.columns(2)
        with col1:
            method = st.selectbox(
                'Choose method',
                ('Lemmatization', 'Stemming'), on_change=reset_data)
        with col2:
            keyword = st.selectbox(
                'Choose column',
                (list_of_column_key), on_change=reset_data)

        @st.cache_data(ttl=3600)
        def clean_keyword(extype):
            """Normalise the chosen keywords column and build the unique-keyword list.

            Returns (keywords, key): the full dataframe with the selected column
            cleaned in place, and a dataframe of unique keywords whose lower-cased
            form lives in column 'new'.
            """
            global keyword, keywords
            try:
                key = keywords[keyword]
            except KeyError:
                st.error('Error: Please check your Author/Index Keywords column.')
                # SystemExit is intercepted by the enclosing bare `except`,
                # which shows the generic message and stops the script run.
                sys.exit(1)
            keywords = keywords.replace(np.nan, '', regex=True)
            keywords[keyword] = keywords[keyword].astype(str)
            # Hyphens become spaces, and '; ' is padded to ' ; ' so each keyword
            # and separator is a whitespace token for the word-level
            # stemmer/lemmatizer applied later.
            keywords[keyword] = keywords[keyword].map(lambda x: re.sub('-', ' ', x))
            keywords[keyword] = keywords[keyword].map(lambda x: re.sub('; ', ' ; ', x))
            keywords[keyword] = keywords[keyword].map(lambda x: x.lower())

            #===Keywords list===
            # Split multi-keyword cells, flatten, dedupe, and sort to get one
            # row per distinct keyword.
            key = key.dropna()
            key = pd.concat([key.str.split('; ', expand=True)], axis=1)
            key = pd.Series(np.ravel(key)).dropna().drop_duplicates().sort_values().reset_index()
            key[0] = key[0].map(lambda x: re.sub('-', ' ', x))
            key['new']=key[0].map(lambda x: x.lower())

            return keywords, key

        #===stem/lem===
        @st.cache_data(ttl=3600)
        def Lemmatization(extype):
            """Lemmatize every word in the keywords column and the keyword list."""
            lemmatizer = WordNetLemmatizer()
            def lemmatize_words(text):
                words = text.split()
                words = [lemmatizer.lemmatize(word) for word in words]
                return ' '.join(words)
            keywords[keyword] = keywords[keyword].apply(lemmatize_words)
            key['new'] = key['new'].apply(lemmatize_words)
            # Undo the ' ; ' separator padding added in clean_keyword.
            keywords[keyword] = keywords[keyword].map(lambda x: re.sub(' ; ', '; ', x))
            return keywords, key

        @st.cache_data(ttl=3600)
        def Stemming(extype):
            """Stem every word (Snowball, English) in the keywords column and list."""
            stemmer = SnowballStemmer("english")
            def stem_words(text):
                words = text.split()
                words = [stemmer.stem(word) for word in words]
                return ' '.join(words)
            keywords[keyword] = keywords[keyword].apply(stem_words)
            key['new'] = key['new'].apply(stem_words)
            # Undo the ' ; ' separator padding added in clean_keyword.
            keywords[keyword] = keywords[keyword].map(lambda x: re.sub(' ; ', '; ', x))
            return keywords, key

        keywords, key = clean_keyword(extype)

        # FIX: compare strings with `==`, not `is`. Identity comparison with a
        # literal only works by accident of CPython string interning and raises
        # SyntaxWarning on Python 3.8+.
        if method == 'Lemmatization':
            keywords, key = Lemmatization(extype)
        else:
            keywords, key = Stemming(extype)

        st.write('Congratulations! 🀩 You choose',keyword ,'with',method,'method. Now, you can easily download the result by clicking the button below')
        st.divider()

        #===show & download csv===
        tab1, tab2, tab3, tab4 = st.tabs(["πŸ“₯ Result", "πŸ“₯ List of Keywords", "πŸ“ƒ Reference", "πŸ“ƒ Recommended Reading"])

        with tab1:
            st.dataframe(keywords, use_container_width=True, hide_index=True)

            @st.cache_data(ttl=3600)
            def convert_df(extype):
                """Serialise the processed table as UTF-8 CSV bytes for download."""
                return keywords.to_csv(index=False).encode('utf-8')

            @st.cache_data(ttl=3600)
            def convert_txt(extype):
                """Serialise the processed table as tab-separated bytes (.txt download)."""
                return keywords.to_csv(index=False, sep='\t', lineterminator='\r').encode('utf-8')

            # Offer the download in the same format that was uploaded.
            if extype.endswith('.csv'):
                csv = convert_df(extype)
                st.download_button(
                    "Press to download result πŸ‘ˆ",
                    csv,
                    "scopus.csv",
                    "text/csv")

            elif extype.endswith('.txt'):
                keywords = rev_conv_txt(extype)
                txt = convert_txt(extype)
                st.download_button(
                    "Press to download result πŸ‘ˆ",
                    txt,
                    "savedrecs.txt",
                    "text/csv")

        with tab2:
            @st.cache_data(ttl=3600)
            def table_keyword(extype):
                """Return the keyword list with a 'label' column (original casing)."""
                keytab = key.drop(['index'], axis=1).rename(columns={0: 'label'})
                return keytab

            #===coloring the same keywords===
            # NOTE(review): st.cache_data keys only on `value`, while the result
            # also depends on the module-level `keytab` — cached colours could go
            # stale if `keytab` changes between runs; confirm this is acceptable.
            @st.cache_data(ttl=3600)
            def highlight_cells(value):
                if keytab['new'].duplicated(keep=False).any() and keytab['new'].duplicated(keep=False)[keytab['new'] == value].any():
                    return 'background-color: yellow'
                return ''

            keytab = table_keyword(extype)
            st.dataframe(keytab.style.applymap(highlight_cells, subset=['new']), use_container_width=True, hide_index=True)

            @st.cache_data(ttl=3600)
            def convert_dfs(extype):
                """Serialise the keyword list as UTF-8 CSV bytes for download."""
                return key.to_csv(index=False).encode('utf-8')

            csv = convert_dfs(extype)

            st.download_button(
                "Press to download keywords πŸ‘ˆ",
                csv,
                "keywords.csv",
                "text/csv")

        with tab3:
            st.markdown('**Santosa, F. A. (2023). Prior steps into knowledge mapping: Text mining application and comparison. Issues in Science and Technology Librarianship, 102.** https://doi.org/10.29173/istl2736')

        with tab4:
            st.markdown('**Beri, A. (2021, January 27). Stemming vs Lemmatization. Medium.** https://towardsdatascience.com/stemming-vs-lemmatization-2daddabcb221')
            st.markdown('**Khyani, D., Siddhartha B S, Niveditha N M, & Divya B M. (2020). An Interpretation of Lemmatization and Stemming in Natural Language Processing. Journal of University of Shanghai for Science and Technology , 22(10), 350–357.** https://jusst.org/an-interpretation-of-lemmatization-and-stemming-in-natural-language-processing/')
            st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Text Pre-Processing. Text Mining for Information Professionals, 79–103.** https://doi.org/10.1007/978-3-030-85085-2_3')

    # Deliberately bare: must also intercept the SystemExit raised by
    # clean_keyword's column check (a plain `except Exception` would let it
    # escape), so every failure ends with this friendly message and a stop
    # instead of a traceback.
    except:
        st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
        st.stop()