mylibrar committed on
Commit
cb27b88
1 Parent(s): d40cd38

Fix issues in topic analysis

Browse files
Files changed (3) hide show
  1. data/topic_charts.json +0 -0
  2. main.py +1 -3
  3. results.py +11 -13
data/topic_charts.json CHANGED
The diff for this file is too large to render. See raw diff
 
main.py CHANGED
@@ -355,9 +355,7 @@ def main():
355
  Li(
356
  A(
357
  "Topic Analysis",
358
- href="/results#section5",
359
- hx_get="/results#section5",
360
- hx_target="#inner-text",
361
  )
362
  )
363
  ),
 
355
  Li(
356
  A(
357
  "Topic Analysis",
358
+ href="#section55",
 
 
359
  )
360
  )
361
  ),
results.py CHANGED
@@ -977,11 +977,11 @@ llama_div = Div(
977
 
978
  with open(os.path.join(os.path.dirname(__file__), "data", "topic_charts.json"), 'r') as f:
979
  topic_charts = json.load(f)
980
- topic_graphs = {}
981
 
982
- for title, data in topic_charts.items():
983
  if data["type"] == "barh":
984
- topic_graphs[title] = go.Figure(go.Bar(
985
  x=data["kwargs"]["width"],
986
  y=data["kwargs"]['y'],
987
  orientation='h',
@@ -989,23 +989,23 @@ for title, data in topic_charts.items():
989
  "rgb(" + ", ".join(str(val * 255) for val in rgb) + ')'
990
  for rgb in data["kwargs"]["color"]
991
  ]
992
- ))
993
  else:
994
- topic_count_graph = go.Figure(go.Pie(
995
  values=data["kwargs"]['x'],
996
  labels=data["kwargs"]["labels"],
997
  marker_colors=[
998
  "rgb(" + ", ".join(str(val * 255) for val in rgb) + ')'
999
  for rgb in data["kwargs"]["colors"]
1000
  ]
1001
- ))
1002
 
1003
  cluster_div = Div(
1004
  Section(
1005
  H2("Topic Analysis"),
1006
- P("We tried to classify data into topic groups and looked for correlations between topics and statistics of data. We believe that different groups should manifest different characteristics of distribution, which could give us some insight into the composition of dataset."),
1007
  H3("Methodology"),
1008
- P("We took the ", A("common crawl", href="https://commoncrawl.org/"), " data and clustered them into 17 topic groups using ", A("BERTopic", href="https://maartengr.github.io/BERTopic/index.html"), ". For each group, we collected and aggregated a series of metrics and calculated average scores for different quality signals and metadata."),
1009
  H3("Cluster Groups"),
1010
  P("We grouped data into the following 17 clusters"),
1011
  Ul(*(
@@ -1013,11 +1013,9 @@ cluster_div = Div(
1013
  for topic_name in ("Arts", "Business & Economics & Finance", "Culture & Cultural geography", "Daily Life & Home & Lifestyle", "Education", "Entertainment & Travel & Hobby", "Environment", "Food & Drink & Cooking", "Health & Wellness & Medicine", "Law & Justice", "Natural Science & Formal Science & Technology", "Personal Development & Human Resources & Career", "Politics & Government", "Religion & Spirituality", "Shopping & Commodity", "Society & Social Issues & Human Rights", "Sports")
1014
  )),
1015
  H3("Results Analysis"),
1016
- H3("Number of document of each topic"),
1017
- plotly2fasthtml(topic_count_graph),
1018
  *(
1019
- Section(H3(title), plotly2fasthtml(graph))
1020
- for title, graph in topic_graphs.items()
1021
  )
1022
  )
1023
  )
@@ -1046,7 +1044,7 @@ def results():
1046
  ),
1047
  Section(
1048
  cluster_div,
1049
- id="section5"
1050
  ),
1051
  id="inner-text"
1052
  )
 
977
 
978
  with open(os.path.join(os.path.dirname(__file__), "data", "topic_charts.json"), 'r') as f:
979
  topic_charts = json.load(f)
980
+ topic_graphs = []
981
 
982
+ for title, data in topic_charts:
983
  if data["type"] == "barh":
984
+ topic_graphs.append(go.Figure(go.Bar(
985
  x=data["kwargs"]["width"],
986
  y=data["kwargs"]['y'],
987
  orientation='h',
 
989
  "rgb(" + ", ".join(str(val * 255) for val in rgb) + ')'
990
  for rgb in data["kwargs"]["color"]
991
  ]
992
+ )))
993
  else:
994
+ topic_graphs.append(go.Figure(go.Pie(
995
  values=data["kwargs"]['x'],
996
  labels=data["kwargs"]["labels"],
997
  marker_colors=[
998
  "rgb(" + ", ".join(str(val * 255) for val in rgb) + ')'
999
  for rgb in data["kwargs"]["colors"]
1000
  ]
1001
+ )))
1002
 
1003
  cluster_div = Div(
1004
  Section(
1005
  H2("Topic Analysis"),
1006
+ P("We tried to classify data into topic groups and looked for correlations between topics and statistics of data. Data from different topic groups should manifest different characteristics of distribution, which can give us some insight into the composition of dataset."),
1007
  H3("Methodology"),
1008
+ P("We took the ", A("common crawl", href="https://commoncrawl.org/"), " data and clustered them into 17 topic groups using ", A("BERTopic", href="https://maartengr.github.io/BERTopic/index.html"), ". We collected and aggregated a series of metrics which include quality signals and other useful metadata. For each topic group, we calculated average scores and generated the corresponding bar charts over different metrics for comparison and analysis."),
1009
  H3("Cluster Groups"),
1010
  P("We grouped data into the following 17 clusters"),
1011
  Ul(*(
 
1013
  for topic_name in ("Arts", "Business & Economics & Finance", "Culture & Cultural geography", "Daily Life & Home & Lifestyle", "Education", "Entertainment & Travel & Hobby", "Environment", "Food & Drink & Cooking", "Health & Wellness & Medicine", "Law & Justice", "Natural Science & Formal Science & Technology", "Personal Development & Human Resources & Career", "Politics & Government", "Religion & Spirituality", "Shopping & Commodity", "Society & Social Issues & Human Rights", "Sports")
1014
  )),
1015
  H3("Results Analysis"),
 
 
1016
  *(
1017
+ Section(H4(title), plotly2fasthtml(topic_graphs[i]))
1018
+ for i, (title, _) in enumerate(topic_charts)
1019
  )
1020
  )
1021
  )
 
1044
  ),
1045
  Section(
1046
  cluster_div,
1047
+ id="section55"
1048
  ),
1049
  id="inner-text"
1050
  )