faizhalas committed on
Commit e951f20 • 1 Parent(s): 1e90abc

Create pages/1 Keywords Stem.py

Files changed (1)
  1. pages/1 Keywords Stem.py +203 -0
pages/1 Keywords Stem.py ADDED
@@ -0,0 +1,203 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ import re
+ import nltk
+ nltk.download('wordnet')
+ from nltk.stem import WordNetLemmatizer
+ nltk.download('stopwords')
+ from nltk.corpus import stopwords
+ from pprint import pprint
+ import pickle
+ import streamlit.components.v1 as components
+ from io import StringIO
+ from nltk.stem.snowball import SnowballStemmer
+ import csv
+ import sys
+
+ #===config===
+ st.set_page_config(
+     page_title="Coconut",
+     page_icon="🥥",
+     layout="wide"
+ )
+ st.header("Keywords Stem")
+ st.subheader('Put your file here...')
+
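+ # Note: the cached helpers below accept the file name (extype) only as a cache
+ # key; they read the actual upload from the surrounding scope.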
+ def reset_data():
+     st.cache_data.clear()
+
+ #===check filetype===
+ @st.cache_data(ttl=3600)
+ def get_ext(extype):
+     extype = uploaded_file.name
+     return extype
+
+ #===upload===
+ @st.cache_data(ttl=3600)
+ def upload(extype):
+     keywords = pd.read_csv(uploaded_file)
+     return keywords
+
+ @st.cache_data(ttl=3600)
+ def conv_txt(extype):
+     # map Web of Science tagged fields (TI, SO, DE, ID) to Scopus-style column names
+     col_dict = {'TI': 'Title',
+         'SO': 'Source title',
+         'DE': 'Author Keywords',
+         'ID': 'Keywords Plus'}
+     keywords = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
+     keywords.rename(columns=col_dict, inplace=True)
+     return keywords
+
+ @st.cache_data(ttl=3600)
+ def rev_conv_txt(extype):
+     # restore the original tagged field names before exporting back to .txt
+     col_dict_rev = {'Title': 'TI',
+         'Source title': 'SO',
+         'Author Keywords': 'DE',
+         'Keywords Plus': 'ID'}
+     keywords.rename(columns=col_dict_rev, inplace=True)
+     return keywords
+
+ @st.cache_data(ttl=3600)
+ def get_data(extype):
+     # offer only the keyword columns (e.g. 'Author Keywords', 'Keywords Plus')
+     list_of_column_key = list(keywords.columns)
+     list_of_column_key = [k for k in list_of_column_key if 'Keyword' in k]
+     return list_of_column_key
+
+ uploaded_file = st.file_uploader("Choose your file", type=['csv','txt'], on_change=reset_data)
+
+ if uploaded_file is not None:
+     extype = get_ext(uploaded_file)
+     if extype.endswith('.csv'):
+         keywords = upload(extype)
+
+     elif extype.endswith('.txt'):
+         keywords = conv_txt(extype)
+
+     list_of_column_key = get_data(extype)
+
+     col1, col2 = st.columns(2)
+     with col1:
+         method = st.selectbox(
+             'Choose method',
+             ('Stemming', 'Lemmatization'), on_change=reset_data)
+     with col2:
+         keyword = st.selectbox(
+             'Choose column',
+             (list_of_column_key), on_change=reset_data)
+
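+     #===clean keywords===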
+     @st.cache_data(ttl=3600)
+     def clean_keyword(extype):
+         global keyword, keywords
+         try:
+             key = keywords[keyword]
+         except KeyError:
+             st.error('Error: Please check your Author/Index Keywords column.')
+             sys.exit(1)
+         # split hyphenated words and pad the '; ' separators so each keyword
+         # is tokenized on whitespace by the stemmer/lemmatizer
+         keywords = keywords.replace(np.nan, '', regex=True)
+         keywords[keyword] = keywords[keyword].astype(str)
+         keywords[keyword] = keywords[keyword].map(lambda x: re.sub('-', ' ', x))
+         keywords[keyword] = keywords[keyword].map(lambda x: re.sub('; ', ' ; ', x))
+         keywords[keyword] = keywords[keyword].map(lambda x: x.lower())
+
+         #===Keywords list===
+         key = key.dropna()
+         key = pd.concat([key.str.split('; ', expand=True)], axis=1)
+         key = pd.Series(np.ravel(key)).dropna().drop_duplicates().sort_values().reset_index()
+         key[0] = key[0].map(lambda x: re.sub('-', ' ', x))
+         key['new'] = key[0].map(lambda x: x.lower())
+
+         return keywords, key
+
+     #===stem/lem===
+     @st.cache_data(ttl=3600)
+     def Lemmatization(extype):
+         lemmatizer = WordNetLemmatizer()
+         def lemmatize_words(text):
+             words = text.split()
+             words = [lemmatizer.lemmatize(word) for word in words]
+             return ' '.join(words)
+         keywords[keyword] = keywords[keyword].apply(lemmatize_words)
+         key['new'] = key['new'].apply(lemmatize_words)
+         # collapse the padded ' ; ' back to '; ' after token-level processing
+         keywords[keyword] = keywords[keyword].map(lambda x: re.sub(' ; ', '; ', x))
+         return keywords, key
+
+     @st.cache_data(ttl=3600)
+     def Stemming(extype):
+         stemmer = SnowballStemmer("english")
+         def stem_words(text):
+             words = text.split()
+             words = [stemmer.stem(word) for word in words]
+             return ' '.join(words)
+         keywords[keyword] = keywords[keyword].apply(stem_words)
+         key['new'] = key['new'].apply(stem_words)
+         keywords[keyword] = keywords[keyword].map(lambda x: re.sub(' ; ', '; ', x))
+         return keywords, key
+
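+     # clean the keywords first, then apply whichever method the user picked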
+     keywords, key = clean_keyword(extype)
+
+     if method == 'Lemmatization':
+         keywords, key = Lemmatization(extype)
+     else:
+         keywords, key = Stemming(extype)
+
+     st.write('Congratulations! 🤩 You chose', keyword, 'with the', method, 'method. Now you can easily download the result by clicking the button below.')
+     st.divider()
+
+     #===show & download csv===
+     tab1, tab2, tab3, tab4 = st.tabs(["📥 Result", "📥 List of Keywords", "📃 Reference", "📃 Recommended Reading"])
+
+     with tab1:
+         st.dataframe(keywords, use_container_width=True)
+         @st.cache_data(ttl=3600)
+         def convert_df(extype):
+             return keywords.to_csv(index=False).encode('utf-8')
+
+         @st.cache_data(ttl=3600)
+         def convert_txt(extype):
+             return keywords.to_csv(index=False, sep='\t', lineterminator='\r').encode('utf-8')
+
+         if extype.endswith('.csv'):
+             csv = convert_df(extype)
+             st.download_button(
+                 "Press to download result 👈",
+                 csv,
+                 "scopus.csv",
+                 "text/csv")
+
+         elif extype.endswith('.txt'):
+             keywords = rev_conv_txt(extype)
+             txt = convert_txt(extype)
+             st.download_button(
+                 "Press to download result 👈",
+                 txt,
+                 "savedrecs.txt",
+                 "text/plain")
+
+     with tab2:
+         @st.cache_data(ttl=3600)
+         def table_keyword(extype):
+             keytab = key.drop(['index'], axis=1).rename(columns={0: 'old'})
+             return keytab
+         keytab = table_keyword(extype)
+         st.dataframe(keytab, use_container_width=True)
+
+         @st.cache_data(ttl=3600)
+         def convert_dfs(extype):
+             return key.to_csv(index=False).encode('utf-8')
+
+         csv = convert_dfs(extype)
+
+         st.download_button(
+             "Press to download keywords 👈",
+             csv,
+             "keywords.csv",
+             "text/csv")
+
+     with tab3:
+         st.markdown('**Santosa, F. A. (2022). Prior steps into knowledge mapping: Text mining application and comparison. Issues in Science and Technology Librarianship, 102.** https://doi.org/10.29173/istl2736')
+
+     with tab4:
+         st.markdown('**Beri, A. (2021, January 27). Stemming vs Lemmatization. Medium.** https://towardsdatascience.com/stemming-vs-lemmatization-2daddabcb221')
+         st.markdown('**Khyani, D., Siddhartha B S, Niveditha N M, & Divya B M. (2020). An Interpretation of Lemmatization and Stemming in Natural Language Processing. Journal of University of Shanghai for Science and Technology, 22(10), 350–357.** https://jusst.org/an-interpretation-of-lemmatization-and-stemming-in-natural-language-processing/')
+         st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Text Pre-Processing. Text Mining for Information Professionals, 79–103.** https://doi.org/10.1007/978-3-030-85085-2_3')