mylibrar committed on
Commit
a711d2f
1 Parent(s): 29ec6a3

topic-analysis (#1)

Browse files

- Add topic analysis (d40cd38cb5959666f569699c1388f60dd0831bd5)
- Fix issues in topic analysis (cb27b88e7b216ab08d98928c4f7bc92313521ae9)
- Add comments to each topic graph (3dd0859f7cebc297557e5d327a5f6af82dea16f5)
- Merge branch 'main' of https://huggingface.co/spaces/LLM360/TxT360-New into pr/1 (9fc9d4a3ec7a1cc20176b2c99faaaa3330a40a60)
- Merge branch 'main' of https://huggingface.co/spaces/LLM360/TxT360-New into pr/1 (30fd7fc71eb78a6dec2c9d686f0db41c42e4941a)

Files changed (3) hide show
  1. data/topic_charts.json +0 -0
  2. main.py +6 -0
  3. results.py +50 -0
data/topic_charts.json ADDED
The diff for this file is too large to render. See raw diff
 
main.py CHANGED
@@ -352,6 +352,12 @@ def main():
352
  href="#section53",
353
  )
354
  ),
 
 
 
 
 
 
355
  ),
356
  ),
357
  role="navigation",
 
352
  href="#section53",
353
  )
354
  ),
355
+ Li(
356
+ A(
357
+ "Topic Analysis",
358
+ href="#section55",
359
+ )
360
+ )
361
  ),
362
  ),
363
  role="navigation",
results.py CHANGED
@@ -830,6 +830,7 @@ intro_div = Div(
830
  Ul(
831
  Li("The Learning Curve of TxT360 with an Upsampling Recipe", style = "margin-bottom: 5px"),
832
  Li("Perplexity Analysis across time", style = "margin-bottom: 5px"),
 
833
  Li(B("Estimated Reading Time: 15 minutes"), style = "margin-bottom: 5px"),
834
  ),
835
  )
@@ -965,6 +966,51 @@ llama_div = Div(
965
  ),
966
  )
967
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
968
 
969
  def results():
970
  return Div(
@@ -986,6 +1032,10 @@ def results():
986
  ),
987
  Section(
988
  llama_div,
 
 
 
 
989
  ),
990
  id="inner-text"
991
  )
 
830
  Ul(
831
  Li("The Learning Curve of TxT360 with an Upsampling Recipe", style = "margin-bottom: 5px"),
832
  Li("Perplexity Analysis across time", style = "margin-bottom: 5px"),
833
+ Li("Topic Analysis on Data Cluster Groups", style = "margin-bottom: 5px"),
834
  Li(B("Estimated Reading Time: 15 minutes"), style = "margin-bottom: 5px"),
835
  ),
836
  )
 
966
  ),
967
  )
968
 
969
def _rgb_css(rgb):
    """Format one colour entry from the chart JSON as an ``rgb(...)`` string.

    Every component is multiplied by 255 and rendered with ``str``, then the
    components are joined with ", ".
    NOTE(review): components are presumably fractions in [0, 1] as exported
    by the chart-generation script -- confirm against that script.
    """
    return "rgb(" + ", ".join(str(val * 255) for val in rgb) + ")"


def _build_topic_figure(title, data):
    """Build the Plotly figure described by one ``topic_charts`` entry.

    Args:
        title: Chart title; only used to make the error message actionable.
        data: Mapping with a ``"type"`` key (``"barh"`` or ``"pie"``) and a
            ``"kwargs"`` mapping holding the series and colours to plot.

    Returns:
        A ``go.Figure`` wrapping a horizontal ``go.Bar`` or a ``go.Pie``.

    Raises:
        ValueError: If ``data["type"]`` is not a supported chart type.
    """
    chart_type = data["type"]
    if chart_type == "barh":
        kwargs = data["kwargs"]
        return go.Figure(go.Bar(
            x=kwargs["width"],
            y=kwargs["y"],
            orientation="h",
            marker_color=[_rgb_css(rgb) for rgb in kwargs["color"]],
        ))
    if chart_type == "pie":
        kwargs = data["kwargs"]
        return go.Figure(go.Pie(
            values=kwargs["x"],
            labels=kwargs["labels"],
            marker_colors=[_rgb_css(rgb) for rgb in kwargs["colors"]],
        ))
    # Skipping an unknown type would leave topic_graphs shorter than
    # topic_charts, so charts would later be looked up by the wrong index.
    # Fail loudly instead.
    raise ValueError(
        f"Unsupported chart type {chart_type!r} for topic chart {title!r}"
    )


# Chart definitions live next to this module in data/topic_charts.json.
# The encoding is passed explicitly so decoding does not depend on the
# host's locale default.
with open(
    os.path.join(os.path.dirname(__file__), "data", "topic_charts.json"),
    "r",
    encoding="utf-8",
) as f:
    # The unpacking below requires the top level to be a sequence of
    # (title, data) pairs.
    topic_charts = json.load(f)

# One figure per entry, in the same order as topic_charts, so that
# topic_graphs[i] always belongs to topic_charts[i].
topic_graphs = [_build_topic_figure(title, data) for title, data in topic_charts]
993
+
994
# Names of the 17 topic groups, in the order they are listed on the page.
_TOPIC_GROUP_NAMES = (
    "Arts",
    "Business & Economics & Finance",
    "Culture & Cultural geography",
    "Daily Life & Home & Lifestyle",
    "Education",
    "Entertainment & Travel & Hobby",
    "Environment",
    "Food & Drink & Cooking",
    "Health & Wellness & Medicine",
    "Law & Justice",
    "Natural Science & Formal Science & Technology",
    "Personal Development & Human Resources & Career",
    "Politics & Government",
    "Religion & Spirituality",
    "Shopping & Commodity",
    "Society & Social Issues & Human Rights",
    "Sports",
)

# One bullet per topic group.
_topic_group_items = [
    Li(group_name, style="margin-bottom: 5px")
    for group_name in _TOPIC_GROUP_NAMES
]

# One sub-section per chart: heading, rendered figure, optional comment.
# The figure is looked up by position, so topic_graphs must hold one figure
# per topic_charts entry, in the same order.
_topic_result_sections = [
    Section(
        H4(chart_title),
        plotly2fasthtml(topic_graphs[idx]),
        P(chart.get("comment", "")),
    )
    for idx, (chart_title, chart) in enumerate(topic_charts)
]

# "Topic Analysis" page section: introduction, methodology, the list of
# cluster groups, then the per-chart results.
cluster_div = Div(
    Section(
        H2("Topic Analysis"),
        P(
            "We tried to classify data into topic groups and looked for "
            "correlations between topics and statistics of data. Data from "
            "different topic groups should manifest different characteristics of "
            "distribution, which can give us some insight into the composition "
            "of dataset."
        ),
        H3("Methodology"),
        P(
            "We took the ",
            A("common crawl", href="https://commoncrawl.org/"),
            " data and clustered them into 17 topic groups using ",
            A("BERTopic", href="https://maartengr.github.io/BERTopic/index.html"),
            ". We collected and aggregated a series of metrics which include "
            "quality signals and other useful metadata. For each topic group, "
            "we calculated average scores and generated the corresponding bar "
            "charts over different metrics for comparison and analysis.",
        ),
        H3("Cluster Groups"),
        P("We grouped data into the following 17 clusters"),
        Ul(*_topic_group_items),
        H3("Results Analysis"),
        *_topic_result_sections,
    )
)
1013
+
1014
 
1015
  def results():
1016
  return Div(
 
1032
  ),
1033
  Section(
1034
  llama_div,
1035
+ ),
1036
+ Section(
1037
+ cluster_div,
1038
+ id="section55"
1039
  ),
1040
  id="inner-text"
1041
  )