File size: 10,328 Bytes
66e2b77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe0dc3e
 
 
 
 
 
 
 
66e2b77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87a8740
66e2b77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
059c226
66e2b77
 
 
059c226
66e2b77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87a8740
 
 
66e2b77
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
#import module
import streamlit as st
import pandas as pd
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules
from streamlit_agraph import agraph, Node, Edge, Config
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import sys

#===config===
# Page-level setup: browser tab title/icon, wide layout and the page heading.
st.set_page_config(
    page_title="Coconut",
    page_icon="🥥",
    layout="wide",
)
st.header("Bidirected Keywords Network")

# CSS injected through st.markdown to hide the elements matched by the
# `#MainMenu` and `footer` selectors.
hide_streamlit_style = """
            <style>
            #MainMenu {visibility: hidden;}
            footer {visibility: hidden;}
            </style>
            """
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

st.subheader('Put your file here...')

#===clear cache===
def reset_all():
    """Clear everything stored by ``st.cache_data``.

    Passed as the ``on_change`` / ``on_click`` callback of the widgets in
    this script.
    """
    st.cache_data.clear()

#===check type===
@st.cache_data(ttl=3600)
def get_ext(extype):
    """Return the file name of an uploaded file.

    Args:
        extype: The uploaded-file object (the caller passes the value
            returned by ``st.file_uploader``). Only its ``name`` attribute
            is read.

    Returns:
        The value of ``extype.name``; the caller inspects its suffix to
        pick a reader.
    """
    # Read the name from the argument rather than from the module-level
    # `uploaded_file`, so the cached result depends only on what was passed in.
    return extype.name

@st.cache_data(ttl=3600)
def upload(extype):
    """Parse the module-level ``uploaded_file`` with ``pd.read_csv`` defaults.

    Args:
        extype: Not used in the body; presumably it only distinguishes
            cache entries -- confirm with the caller.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(uploaded_file)

@st.cache_data(ttl=3600)
def conv_txt(extype):
    """Parse the module-level ``uploaded_file`` as tab-separated text.

    Records are split on carriage returns, and the two-letter column tags
    listed below are renamed to the long column names used by the rest of
    the script (presumably Web of Science field tags -- confirm).

    Args:
        extype: Not used in the body; presumably it only distinguishes
            cache entries -- confirm with the caller.

    Returns:
        The parsed DataFrame with the renamed columns.
    """
    renamed_columns = {
        'TI': 'Title',
        'SO': 'Source title',
        'DT': 'Document Type',
        'DE': 'Author Keywords',
        'ID': 'Keywords Plus',
    }
    frame = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
    return frame.rename(columns=renamed_columns)

#===Read data===
# File picker restricted to .csv / .txt; choosing a different file clears the cache.
uploaded_file = st.file_uploader("Choose a file", type=['csv', 'txt'], on_change=reset_all)

if uploaded_file is not None:
    extype = get_ext(uploaded_file)
    # Match the suffix case-insensitively so that a name such as "DATA.CSV"
    # is still routed to a reader instead of leaving `papers` undefined.
    if extype.lower().endswith('.csv'):
        papers = upload(extype)
    elif extype.lower().endswith('.txt'):
        papers = conv_txt(extype)
    
    @st.cache_data(ttl=3600)
    def get_data_arul(extype):
        """Return ``papers`` together with its keyword column names.

        Args:
            extype: Not used in the body; presumably it only distinguishes
                cache entries -- confirm with the caller.

        Returns:
            A ``(papers, names)`` tuple where ``names`` lists, in column
            order, every column of the module-level ``papers`` whose name
            contains the substring 'Keyword'.
        """
        keyword_columns = [column for column in papers.columns if 'Keyword' in column]
        return papers, keyword_columns

    papers, list_of_column_key = get_data_arul(extype)

    # Two side-by-side selectors: the word-normalisation method and the
    # keyword column to analyse. Changing either one clears the cache.
    col1, col2 = st.columns(2)
    method = col1.selectbox(
        'Choose method',
        ('Stemming', 'Lemmatization'),
        on_change=reset_all,
    )
    keyword = col2.selectbox(
        'Choose column',
        list_of_column_key,
        on_change=reset_all,
    )


    #===body=== 
    @st.cache_data(ttl=3600)
    def clean_arul(extype):
        """Return the rows of ``papers`` that have keywords, with that column normalised.

        Reads the module-level ``papers`` DataFrame and ``keyword`` column
        name. Rows with a missing value in that column are dropped, then
        each remaining value is normalised:

        * every hyphen, em dash or en dash becomes a space;
        * every '; ' separator is padded to ' ; ';
        * the text is lower-cased.

        Args:
            extype: Not used in the body; presumably it only distinguishes
                cache entries -- confirm with the caller.

        Returns:
            A new DataFrame; ``papers`` itself is not modified.

        If ``keyword`` is not a column of ``papers`` an error message is
        shown and the current script run is halted.
        """
        try:
            # Work on a copy so the column assignment below never writes
            # into a slice of `papers`.
            arul = papers.dropna(subset=[keyword]).copy()
        except KeyError:
            st.error('Error: Please check your Author/Index Keywords column.')
            # st.stop() is Streamlit's way to end the current run;
            # sys.exit() would raise SystemExit inside the script thread.
            st.stop()

        # A character class, so that ANY single dash character is replaced.
        # The bare pattern '-—–' would only match that exact three-character
        # sequence.
        dash_pattern = re.compile('[-—–]')

        def normalise(text):
            text = dash_pattern.sub(' ', text)
            text = text.replace('; ', ' ; ')
            return text.lower()

        arul[keyword] = arul[keyword].map(normalise)
        return arul

    arul = clean_arul(extype)

    #===stem/lem===
    @st.cache_data(ttl=3600)
    def lemma_arul(extype):
        """Lemmatize every whitespace-separated token of the keyword column.

        Each value of ``arul[keyword]`` is split on whitespace, every token
        is passed through ``WordNetLemmatizer.lemmatize`` and the tokens are
        re-joined with single spaces. The column of the module-level
        ``arul`` is overwritten in place.

        Args:
            extype: Not used in the body; presumably it only distinguishes
                cache entries -- confirm with the caller.

        Returns:
            The module-level ``arul`` DataFrame after the update.
        """
        wnl = WordNetLemmatizer()
        arul[keyword] = arul[keyword].apply(
            lambda text: ' '.join(wnl.lemmatize(token) for token in text.split())
        )
        return arul
    
    @st.cache_data(ttl=3600)
    def stem_arul(extype):
        """Stem every whitespace-separated token of the keyword column.

        Each value of ``arul[keyword]`` is split on whitespace, every token
        is passed through an English ``SnowballStemmer`` and the tokens are
        re-joined with single spaces. The column of the module-level
        ``arul`` is overwritten in place.

        Args:
            extype: Not used in the body; presumably it only distinguishes
                cache entries -- confirm with the caller.

        Returns:
            The module-level ``arul`` DataFrame after the update.
        """
        snowball = SnowballStemmer("english")
        arul[keyword] = arul[keyword].apply(
            lambda text: ' '.join(snowball.stem(token) for token in text.split())
        )
        return arul

    # Compare by value: `is` tests object identity, which is not guaranteed
    # for two equal strings (and raises a SyntaxWarning with a literal).
    if method == 'Lemmatization':
        arul = lemma_arul(extype)
    else:
        arul = stem_arul(extype)
    
    @st.cache_data(ttl=3600)
    def arm(extype):
        """Encode the keyword lists as a transaction table.

        Each value of ``arul[keyword]`` is split on ' ; ' into a list of
        keywords; the lists are fitted and transformed with the
        module-level ``TransactionEncoder`` instance ``te``.

        Args:
            extype: Not used in the body; presumably it only distinguishes
                cache entries -- confirm with the caller.

        Returns:
            A DataFrame built from the encoder output, with the encoder's
            ``columns_`` as column labels.
        """
        transactions = arul[keyword].str.split(' ; ').values.tolist()
        encoded = te.fit(transactions).transform(transactions)
        return pd.DataFrame(encoded, columns=te.columns_)

    df = arm(extype)

    # Three side-by-side sliders for the mining thresholds. Arguments are
    # (label, min, max, initial value); moving any slider clears the cache.
    col1, col2, col3 = st.columns(3)
    supp = col1.slider(
        'Select value of Support',
        0.001, 1.000, 0.010,
        on_change=reset_all,
    )
    conf = col2.slider(
        'Select value of Confidence',
        0.001, 1.000, 0.050,
        on_change=reset_all,
    )
    maxlen = col3.slider(
        'Maximum length of the itemsets generated',
        2, 8, 2,
        on_change=reset_all,
    )

    # Result tab, citation tab and further-reading tab.
    tab1, tab2, tab3 = st.tabs(["📈 Result & Generate visualization", "📃 Reference", "📓 Recommended Reading"])
    
    with tab1:
        #===Association rules===
        @st.cache_data(ttl=3600)
        def freqitem(extype):
            """Run ``fpgrowth`` on the module-level transaction table ``df``.

            Uses the module-level ``supp`` as ``min_support`` and ``maxlen``
            as ``max_len``, and asks for item names rather than column
            indices.

            Args:
                extype: Not used in the body; presumably it only
                    distinguishes cache entries -- confirm with the caller.

            Returns:
                The DataFrame returned by ``fpgrowth``.
            """
            return fpgrowth(df, min_support=supp, use_colnames=True, max_len=maxlen)
        
        @st.cache_data(ttl=3600)
        def arm_table(extype):
            """Build the association-rule table from the module-level ``freq_item``.

            Rules are generated with ``association_rules`` using the
            confidence metric and the module-level ``conf`` as threshold.
            Only the columns listed below are kept, and the antecedent /
            consequent item sets are flattened to ', '-joined strings.

            Args:
                extype: Not used in the body; presumably it only
                    distinguishes cache entries -- confirm with the caller.

            Returns:
                A 2-tuple containing the same rule DataFrame twice.
            """
            kept_columns = [
                'antecedents', 'consequents', 'antecedent support',
                'consequent support', 'support', 'confidence', 'lift',
                'conviction',
            ]
            rules = association_rules(freq_item, metric='confidence', min_threshold=conf)
            rules = rules[kept_columns]
            for side in ('antecedents', 'consequents'):
                rules[side] = rules[side].apply(lambda x: ', '.join(list(x))).astype('unicode')
            return rules, rules

        # Mine the frequent itemsets with the current slider settings.
        freq_item = freqitem(extype)
        st.write('🚨 The more data you have, the longer you will have to wait.')

        # An empty result means no itemset met the chosen thresholds.
        if freq_item.empty:
            st.error('Please lower your value.', icon="🚨")
        else:
            # Derive the rule table and display it across the full container width.
            res, restab = arm_table(extype)
            st.dataframe(restab, use_container_width=True)
                   
             #===visualize===
                
            if st.button('📈 Generate network visualization', on_click=reset_all):
                with st.spinner('Visualizing, please wait ....'):
                     @st.cache_data(ttl=3600)
                     def map_node(extype):
                        """Prepare the node table and edge tooltips for the graph.

                        Adds a 'to' column to the module-level ``res`` with
                        the text "<antecedents> → <consequents>" followed by
                        the rule's support, confidence and conviction on
                        separate lines. Then builds a node table with the
                        columns 'node' and 'size' from the antecedent and
                        consequent sides (sized by their respective support)
                        and drops duplicate rows.

                        Args:
                            extype: Not used in the body; presumably it only
                                distinguishes cache entries -- confirm with
                                the caller.

                        Returns:
                            ``(res_node, res)``: the node table and the rule
                            table including the new 'to' column.
                        """
                        res['to'] = (
                            res['antecedents'] + ' → ' + res['consequents']
                            + '\n Support = ' + res['support'].astype(str)
                            + '\n Confidence = ' + res['confidence'].astype(str)
                            + '\n Conviction = ' + res['conviction'].astype(str)
                        )
                        res_ant = res[['antecedents', 'antecedent support']].rename(
                            columns={'antecedents': 'node', 'antecedent support': 'size'})
                        res_con = res[['consequents', 'consequent support']].rename(
                            columns={'consequents': 'node', 'consequent support': 'size'})
                        res_node = pd.concat([res_ant, res_con]).drop_duplicates(keep='first')
                        return res_node, res

                     res_node, res = map_node(extype)

                     @st.cache_data(ttl=3600)
                     def arul_network(extype):
                        """Create the ``Node`` and ``Edge`` objects for the graph.

                        One node is created per row of the module-level
                        ``res_node`` (size = 50 * 'size' + 10, id, label and
                        group all set to the 'node' text). One edge is
                        created per row of ``res``, from 'antecedents' to
                        'consequents', with the 'to' text as title and twice
                        the confidence as width.

                        Args:
                            extype: Not used in the body; presumably it only
                                distinguishes cache entries -- confirm with
                                the caller.

                        Returns:
                            ``(nodes, edges)``: two lists.
                        """
                        nodes = [
                            Node(id=name,
                                 label=name,
                                 size=50*weight+10,
                                 shape="dot",
                                 labelHighlightBold=True,
                                 group=name,
                                 opacity=10,
                                 mass=1)
                            for weight, name in zip(res_node['size'], res_node['node'])
                        ]
                        edges = [
                            Edge(source=origin,
                                 target=destination,
                                 title=tooltip,
                                 width=confidence*2,
                                 physics=True,
                                 smooth=True)
                            for origin, destination, confidence, tooltip in zip(
                                res['antecedents'], res['consequents'], res['confidence'], res['to'])
                        ]
                        return nodes, edges

                     nodes, edges = arul_network(extype)
                     # Canvas and layout settings handed to agraph: a
                     # 1200x800 directed, non-hierarchical graph with
                     # physics enabled.
                     config = Config(
                         width=1200,
                         height=800,
                         directed=True,
                         physics=True,
                         hierarchical=False,
                         maxVelocity=5,
                     )

                     # Draw the graph; the value agraph returns is stored
                     # but not used further in this part of the script.
                     return_value = agraph(nodes=nodes, edges=edges, config=config)
    # Citation for the method implemented by this page.
    with tab2:
        st.markdown('**Santosa, F. A. (2023). Adding Perspective to the Bibliometric Mapping Using Bidirected Graph. Open Information Science, 7(1), 20220152.** https://doi.org/10.1515/opis-2022-0152')

    # Further reading, one markdown entry per reference.
    with tab3:
        st.markdown('**Agrawal, R., Imieliński, T., & Swami, A. (1993). Mining association rules between sets of items in large databases. In ACM SIGMOD Record (Vol. 22, Issue 2, pp. 207–216). Association for Computing Machinery (ACM).** https://doi.org/10.1145/170036.170072')
        st.markdown('**Brin, S., Motwani, R., Ullman, J. D., & Tsur, S. (1997). Dynamic itemset counting and implication rules for market basket data. ACM SIGMOD Record, 26(2), 255–264.** https://doi.org/10.1145/253262.253325')
        st.markdown('**Edmonds, J., & Johnson, E. L. (2003). Matching: A Well-Solved Class of Integer Linear Programs. Combinatorial Optimization — Eureka, You Shrink!, 27–30.** https://doi.org/10.1007/3-540-36478-1_3')
        st.markdown('**Li, M. (2016, August 23). An exploration to visualise the emerging trends of technology foresight based on an improved technique of co-word analysis and relevant literature data of WOS. Technology Analysis & Strategic Management, 29(6), 655–671.** https://doi.org/10.1080/09537325.2016.1220518')