faizhalas committed on
Commit 66e2b77 • 1 Parent(s): c2c9bab

Create 3 Bidirected Network.py

Files changed (1)
  1. pages/3 Bidirected Network.py +236 -0
pages/3 Bidirected Network.py ADDED
@@ -0,0 +1,236 @@
+ #import module
+ import streamlit as st
+ import pandas as pd
+ import re
+ import nltk
+ nltk.download('punkt')
+ from nltk.tokenize import word_tokenize
+ from mlxtend.preprocessing import TransactionEncoder
+ te = TransactionEncoder()
+ from mlxtend.frequent_patterns import fpgrowth
+ from mlxtend.frequent_patterns import association_rules
+ from streamlit_agraph import agraph, Node, Edge, Config
+ import nltk
+ nltk.download('wordnet')
+ from nltk.stem import WordNetLemmatizer
+ nltk.download('stopwords')
+ from nltk.corpus import stopwords
+ from nltk.stem.snowball import SnowballStemmer
+ import sys
+
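+ # This page mines association rules between terms in the selected keyword column
+ # (FP-Growth + association_rules from mlxtend) and draws the rules as a keyword
+ # network with streamlit_agraph.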
+ #===config===
+ st.set_page_config(
+     page_title="Coconut",
+     page_icon="🥥",
+     layout="wide"
+ )
+ st.header("Bidirected Keywords Network")
+ st.subheader('Put your file here...')
+
+ #===clear cache===
+ def reset_all():
+     st.cache_data.clear()
+
+ #===check type===
+ @st.cache_data(ttl=3600)
+ def get_ext(extype):
+     extype = uploaded_file.name
+     return extype
+
+ @st.cache_data(ttl=3600)
+ def upload(extype):
+     papers = pd.read_csv(uploaded_file)
+     return papers
+
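+ # conv_txt reads a tab-separated Web of Science export and renames its two-letter
+ # field tags (TI, SO, DT, DE, ID) to the column names used elsewhere in the app.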
+ @st.cache_data(ttl=3600)
+ def conv_txt(extype):
+     col_dict = {'TI': 'Title',
+             'SO': 'Source title',
+             'DT': 'Document Type',
+             'DE': 'Author Keywords',
+             'ID': 'Keywords Plus'}
+     papers = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
+     papers.rename(columns=col_dict, inplace=True)
+     return papers
+
+ #===Read data===
+ uploaded_file = st.file_uploader("Choose a file", type=['csv', 'txt'], on_change=reset_all)
+
+ if uploaded_file is not None:
+     extype = get_ext(uploaded_file)
+     if extype.endswith('.csv'):
+         papers = upload(extype)
+     elif extype.endswith('.txt'):
+         papers = conv_txt(extype)
+
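+     # keep only the columns whose name contains 'Keyword' (e.g. Author Keywords,
+     # Keywords Plus) as candidates for the column selector below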
+     @st.cache_data(ttl=3600)
+     def get_data_arul(extype):
+         list_of_column_key = list(papers.columns)
+         list_of_column_key = [k for k in list_of_column_key if 'Keyword' in k]
+         return papers, list_of_column_key
+
+     papers, list_of_column_key = get_data_arul(extype)
+
+     col1, col2 = st.columns(2)
+     with col1:
+         method = st.selectbox(
+             'Choose method',
+             ('Stemming', 'Lemmatization'), on_change=reset_all)
+     with col2:
+         keyword = st.selectbox(
+             'Choose column',
+             (list_of_column_key), on_change=reset_all)
+
+
+     #===body===
+     @st.cache_data(ttl=3600)
+     def clean_arul(extype):
+         global keyword, papers
+         try:
+             arul = papers.dropna(subset=[keyword])
+         except KeyError:
+             st.error('Error: Please check your Author/Index Keywords column.')
+             sys.exit(1)
+         arul[keyword] = arul[keyword].map(lambda x: re.sub('[-—–]', ' ', x))  # replace dash characters with spaces
+         arul[keyword] = arul[keyword].map(lambda x: re.sub('; ', ' ; ', x))
+         arul[keyword] = arul[keyword].map(lambda x: x.lower())
+         arul[keyword] = arul[keyword].dropna()
+         return arul
+
+     arul = clean_arul(extype)
+
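+     # Stemming/Lemmatization merges spelling variants of the same keyword
+     # (e.g. "network" vs "networks") so they are counted as a single item.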
+     #===stem/lem===
+     @st.cache_data(ttl=3600)
+     def lemma_arul(extype):
+         lemmatizer = WordNetLemmatizer()
+         def lemmatize_words(text):
+             words = text.split()
+             words = [lemmatizer.lemmatize(word) for word in words]
+             return ' '.join(words)
+         arul[keyword] = arul[keyword].apply(lemmatize_words)
+         return arul
+
+     @st.cache_data(ttl=3600)
+     def stem_arul(extype):
+         stemmer = SnowballStemmer("english")
+         def stem_words(text):
+             words = text.split()
+             words = [stemmer.stem(word) for word in words]
+             return ' '.join(words)
+         arul[keyword] = arul[keyword].apply(stem_words)
+         return arul
+
+     if method == 'Lemmatization':
+         arul = lemma_arul(extype)
+     else:
+         arul = stem_arul(extype)
+
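+     # each record's keyword string is split on ' ; ' into a transaction and
+     # one-hot encoded with TransactionEncoder before running FP-Growth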
+     @st.cache_data(ttl=3600)
+     def arm(extype):
+         arule = arul[keyword].str.split(' ; ')
+         arule_list = arule.values.tolist()
+         te_ary = te.fit(arule_list).transform(arule_list)
+         df = pd.DataFrame(te_ary, columns=te.columns_)
+         return df
+     df = arm(extype)
+
+     col1, col2, col3 = st.columns(3)
+     with col1:
+         supp = st.slider(
+             'Select value of Support',
+             0.001, 1.000, (0.010), on_change=reset_all)
+     with col2:
+         conf = st.slider(
+             'Select value of Confidence',
+             0.001, 1.000, (0.050), on_change=reset_all)
+     with col3:
+         maxlen = st.slider(
+             'Maximum length of the itemsets generated',
+             2, 8, (2), on_change=reset_all)
+
+     tab1, tab2 = st.tabs(["📈 Result & Generate visualization", "📓 Recommended Reading"])
+
+     with tab1:
+         #===Association rules===
+         @st.cache_data(ttl=3600)
+         def freqitem(extype):
+             freq_item = fpgrowth(df, min_support=supp, use_colnames=True, max_len=maxlen)
+             return freq_item
+
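+         # association_rules keeps only the rules that meet the confidence threshold and
+         # reports support, confidence, lift and conviction for each antecedent/consequent pair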
+         @st.cache_data(ttl=3600)
+         def arm_table(extype):
+             res = association_rules(freq_item, metric='confidence', min_threshold=conf)
+             res = res[['antecedents', 'consequents', 'antecedent support', 'consequent support', 'support', 'confidence', 'lift', 'conviction']]
+             res['antecedents'] = res['antecedents'].apply(lambda x: ', '.join(list(x))).astype('unicode')
+             res['consequents'] = res['consequents'].apply(lambda x: ', '.join(list(x))).astype('unicode')
+             restab = res
+             return res, restab
+
+         freq_item = freqitem(extype)
+         st.write('🚨 The more data you have, the longer you will have to wait.')
+
+         if freq_item.empty:
+             st.error('Please lower your value.', icon="🚨")
+         else:
+             res, restab = arm_table(extype)
+             st.dataframe(restab, use_container_width=True)
+
+             #===visualize===
+
+             if st.button('📈 Generate network visualization', on_click=reset_all):
+                 with st.spinner('Visualizing, please wait ....'):
+                     @st.cache_data(ttl=3600)
+                     def map_node(extype):
+                         res['to'] = res['antecedents'] + ' → ' + res['consequents'] + '\n Support = ' + res['support'].astype(str) + '\n Confidence = ' + res['confidence'].astype(str) + '\n Conviction = ' + res['conviction'].astype(str)
+                         res_ant = res[['antecedents','antecedent support']].rename(columns={'antecedents': 'node', 'antecedent support': 'size'})
+                         res_con = res[['consequents','consequent support']].rename(columns={'consequents': 'node', 'consequent support': 'size'})
+                         res_node = pd.concat([res_ant, res_con]).drop_duplicates(keep='first')
+                         return res_node, res
+
+                     res_node, res = map_node(extype)
+
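+                     # node size scales with item support (50*support + 10) and edge width
+                     # with rule confidence, so frequent keywords and strong rules stand out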
+                     @st.cache_data(ttl=3600)
+                     def arul_network(extype):
+                         nodes = []
+                         edges = []
+
+                         for w,x in zip(res_node['size'], res_node['node']):
+                             nodes.append( Node(id=x,
+                                 label=x,
+                                 size=50*w+10,
+                                 shape="circularImage",
+                                 labelHighlightBold=True,
+                                 group=x,
+                                 opacity=10,
+                                 mass=1,
+                                 image="https://upload.wikimedia.org/wikipedia/commons/f/f1/Eo_circle_yellow_circle.svg")
+                                 )
+
+                         for y,z,a,b in zip(res['antecedents'],res['consequents'],res['confidence'],res['to']):
+                             edges.append( Edge(source=y,
+                                 target=z,
+                                 title=b,
+                                 width=a*2,
+                                 physics=True,
+                                 smooth=True
+                                 )
+                                 )
+                         return nodes, edges
+
+                     nodes, edges = arul_network(extype)
+                     config = Config(width=1200,
+                         height=800,
+                         directed=True,
+                         physics=True,
+                         hierarchical=False,
+                         maxVelocity=5
+                         )
+
+                     return_value = agraph(nodes=nodes,
+                         edges=edges,
+                         config=config)
+     with tab2:
+         st.markdown('**Agrawal, R., Imieliński, T., & Swami, A. (1993). Mining association rules between sets of items in large databases. In ACM SIGMOD Record (Vol. 22, Issue 2, pp. 207–216). Association for Computing Machinery (ACM).** https://doi.org/10.1145/170036.170072')
+         st.markdown('**Brin, S., Motwani, R., Ullman, J. D., & Tsur, S. (1997). Dynamic itemset counting and implication rules for market basket data. ACM SIGMOD Record, 26(2), 255–264.** https://doi.org/10.1145/253262.253325')
+         st.markdown('**Edmonds, J., & Johnson, E. L. (2003). Matching: A Well-Solved Class of Integer Linear Programs. Combinatorial Optimization — Eureka, You Shrink!, 27–30.** https://doi.org/10.1007/3-540-36478-1_3')
+         st.markdown('**Li, M. (2016, August 23). An exploration to visualise the emerging trends of technology foresight based on an improved technique of co-word analysis and relevant literature data of WOS. Technology Analysis & Strategic Management, 29(6), 655–671.** https://doi.org/10.1080/09537325.2016.1220518')