mylibrar committed on
Commit
d40cd38
1 Parent(s): ac7d8cf

Add topic analysis

Browse files
Files changed (3) hide show
  1. data/topic_charts.json +0 -0
  2. main.py +8 -0
  3. results.py +52 -0
data/topic_charts.json ADDED
The diff for this file is too large to render. See raw diff
 
main.py CHANGED
@@ -352,6 +352,14 @@ def main():
352
  href="#section53",
353
  )
354
  ),
 
 
 
 
 
 
 
 
355
  ),
356
  ),
357
  role="navigation",
 
352
  href="#section53",
353
  )
354
  ),
355
+ Li(
356
+ A(
357
+ "Topic Analysis",
358
+ href="/results#section5",
359
+ hx_get="/results#section5",
360
+ hx_target="#inner-text",
361
+ )
362
+ )
363
  ),
364
  ),
365
  role="navigation",
results.py CHANGED
@@ -839,6 +839,7 @@ intro_div = Div(
839
  Ul(
840
  Li("The Learning Curve of TxT360 with an Upsampling Recipe", style = "margin-bottom: 5px"),
841
  Li("Perplexity Analysis across time", style = "margin-bottom: 5px"),
 
842
  ),
843
  )
844
 
@@ -974,6 +975,53 @@ llama_div = Div(
974
  ),
975
  )
976
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
977
 
978
  def results():
979
  return Div(
@@ -995,6 +1043,10 @@ def results():
995
  ),
996
  Section(
997
  llama_div,
 
 
 
 
998
  ),
999
  id="inner-text"
1000
  )
 
839
  Ul(
840
  Li("The Learning Curve of TxT360 with an Upsampling Recipe", style = "margin-bottom: 5px"),
841
  Li("Perplexity Analysis across time", style = "margin-bottom: 5px"),
842
+ Li("Topic Analysis on Data Cluster Groups", style = "margin-bottom: 5px"),
843
  ),
844
  )
845
 
 
975
  ),
976
  )
977
 
978
+ with open(os.path.join(os.path.dirname(__file__), "data", "topic_charts.json"), 'r') as f:
979
+ topic_charts = json.load(f)
980
+ topic_graphs = {}
981
+
982
+ for title, data in topic_charts.items():
983
+ if data["type"] == "barh":
984
+ topic_graphs[title] = go.Figure(go.Bar(
985
+ x=data["kwargs"]["width"],
986
+ y=data["kwargs"]['y'],
987
+ orientation='h',
988
+ marker_color=[
989
+ "rgb(" + ", ".join(str(val * 255) for val in rgb) + ')'
990
+ for rgb in data["kwargs"]["color"]
991
+ ]
992
+ ))
993
+ else:
994
+ topic_count_graph = go.Figure(go.Pie(
995
+ values=data["kwargs"]['x'],
996
+ labels=data["kwargs"]["labels"],
997
+ marker_colors=[
998
+ "rgb(" + ", ".join(str(val * 255) for val in rgb) + ')'
999
+ for rgb in data["kwargs"]["colors"]
1000
+ ]
1001
+ ))
1002
+
1003
+ cluster_div = Div(
1004
+ Section(
1005
+ H2("Topic Analysis"),
1006
+ P("We tried to classify data into topic groups and looked for correlations between topics and statistics of data. We believe that different groups should manifest different characteristics of distribution, which could give us some insight into the composition of dataset."),
1007
+ H3("Methodology"),
1008
+ P("We took the ", A("common crawl", href="https://commoncrawl.org/"), " data and clustered them into 17 topic groups using ", A("BERTopic", href="https://maartengr.github.io/BERTopic/index.html"), ". For each group, we collected and aggregated a series of metrics and calculated average scores for different quality signals and metadata."),
1009
+ H3("Cluster Groups"),
1010
+ P("We grouped data into the following 17 clusters"),
1011
+ Ul(*(
1012
+ Li(topic_name, style = "margin-bottom: 5px")
1013
+ for topic_name in ("Arts", "Business & Economics & Finance", "Culture & Cultural geography", "Daily Life & Home & Lifestyle", "Education", "Entertainment & Travel & Hobby", "Environment", "Food & Drink & Cooking", "Health & Wellness & Medicine", "Law & Justice", "Natural Science & Formal Science & Technology", "Personal Development & Human Resources & Career", "Politics & Government", "Religion & Spirituality", "Shopping & Commodity", "Society & Social Issues & Human Rights", "Sports")
1014
+ )),
1015
+ H3("Results Analysis"),
1016
+ H3("Number of document of each topic"),
1017
+ plotly2fasthtml(topic_count_graph),
1018
+ *(
1019
+ Section(H3(title), plotly2fasthtml(graph))
1020
+ for title, graph in topic_graphs.items()
1021
+ )
1022
+ )
1023
+ )
1024
+
1025
 
1026
  def results():
1027
  return Div(
 
1043
  ),
1044
  Section(
1045
  llama_div,
1046
+ ),
1047
+ Section(
1048
+ cluster_div,
1049
+ id="section5"
1050
  ),
1051
  id="inner-text"
1052
  )