simonschoe committed on
Commit
3932ad8
β€’
1 Parent(s): 3488f7c
Files changed (3) hide show
  1. README.md +7 -4
  2. app.py +85 -0
  3. requirements.txt +2 -0
README.md CHANGED
@@ -1,12 +1,15 @@
1
  ---
2
- title: TIC
3
- emoji: 🐨
4
- colorFrom: green
5
  colorTo: red
6
  sdk: gradio
7
  sdk_version: 3.1.4
8
  app_file: app.py
9
- pinned: false
 
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
  ---
2
+ title: TIClassifier
3
+ emoji: πŸ“ƒβœ¨
4
+ colorFrom: blue
5
  colorTo: red
6
  sdk: gradio
7
  sdk_version: 3.1.4
8
  app_file: app.py
9
+ models: [simonschoe/TransformationTransformer]
10
+ pinned: true
11
+
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
15
+
app.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
from transformers import pipeline
from transformers_interpret import SequenceClassificationExplainer
from bs4 import BeautifulSoup


# Setup model: one text-classification pipeline plus an explainer built on the
# same model/tokenizer, so predictions and attributions always agree.
classifier = pipeline("text-classification", model="simonschoe/TransformationTransformer")
explainer = SequenceClassificationExplainer(classifier.model, classifier.tokenizer)

# Static HTML legend appended below the word-importance visualization.
legend = """
<div style="text-align: center; display: block; margin-left: auto; margin-right: auto; border-top: 1px solid; margin-top: 5px; padding-top: 5px;"><b>Legend: </b><span style="display: inline-block; width: 10px; height: 10px; border: 1px solid; background-color: hsl(0, 75%, 60%)"></span> Generic <span style="display: inline-block; width: 10px; height: 10px; border: 1px solid; background-color: hsl(120, 75%, 50%)"></span> Transformation </div>
"""


def classify(_input):
    """
    Wrapper to compute the LABEL_1 probability and a word-importance
    explanation for the given input text.

    Args:
        _input: raw text to classify.

    Returns:
        tuple: (score, result_html) where ``score`` is the model probability
        of LABEL_1 and ``result_html`` is the attribution visualization with
        the color legend appended.
    """
    result = classifier(_input)[0]
    score = result['score']
    # The pipeline reports the probability of the *predicted* label; flip it
    # so the returned score is always the LABEL_1 probability.
    if result['label'] == 'LABEL_0':
        score = 1 - score

    # Calling the explainer populates the internal attribution state that
    # visualize() renders; the returned attributions themselves are unused,
    # so the result is deliberately not bound to a variable.
    explainer(_input)
    html = explainer.visualize().__html__()

    # Keep only the last table cell (the word-importance row) and turn it
    # into a standalone <div> for embedding in the Gradio HTML component.
    # NOTE(review): the naive 'td' -> 'div' replace rewrites *every*
    # occurrence of "td" in the cell markup, not just the tags -- confirm
    # against the explainer's actual output if its HTML ever changes.
    soup = BeautifulSoup(html, 'html.parser')
    explanation = str(soup.find_all('td')[-1]).replace('td', 'div')
    # Append the legend so users can interpret the highlight colors.
    result_html = explanation + legend
    return score, result_html


app = gr.Blocks()

with app:
    # NOTE(review): the headings and descriptions below appear to be copied
    # from the "Call2Vec" semantic-search Space and do not describe this
    # classifier app (see README title "TIClassifier") -- confirm and update.
    gr.Markdown("# Call2Vec")
    gr.Markdown("## Semantic Search in Quarterly Earnings Conference Calls")
    with gr.Row():
        # Left column: input textbox, trigger button, and the two outputs.
        with gr.Column():
            text_in = gr.Textbox(lines=1, placeholder="Insert text", label="Search Query")
            with gr.Row():
                compute_bt = gr.Button("Calculate")
            score_out = gr.Number(label="Label 1 probability", interactive=False)
            html_out = gr.HTML(label="Explanation")
        # Right column: static project description and usage notes.
        with gr.Column():
            gr.Markdown(
                """
                #### Project Description
                Call2Vec is a [fastText](https://fasttext.cc/) word embedding model trained via [Gensim](https://radimrehurek.com/gensim/). It maps each token in the vocabulary into a dense, 300-dimensional vector space, designed for performing semantic search.
                The model is trained on a large sample of quarterly earnings conference calls, held by U.S. firms during the 2006-2022 period. In particular, the training data is restriced to the (rather sponentous) executives' remarks of the Q&A section of the call. The data has been preprocessed prior to model training via stop word removal, lemmatization, named entity masking, and coocurrence modeling.
                """
            )
            gr.Markdown(
                """
                #### App usage
                The model is intented to be used for **semantic search**: It encodes the search query (entered in the textbox on the right) in a dense vector space and finds semantic neighbours, i.e., token which frequently occur within similar contexts in the underlying training data.
                The model allows for two use cases:
                1. *Single Search:* The input query consists of a single word. When provided a bi-, tri-, or even fourgram, the quality of the model output depends on the presence of the query token in the model's vocabulary. N-grams should be concated by an underscore (e.g., "machine_learning" or "artifical_intelligence").
                2. *Multi Search:* The input query may consist of several words or n-grams, seperated by comma, semi-colon or newline. It then computes the average vector over all inputs and performs semantic search based on the average input token.
                """
            )
    # Cached examples: outputs are precomputed at startup so example clicks
    # do not re-run the (slow) model.
    gr.Examples(
        examples=[["Now Accord networks is a company in video, and he led the sales team, and the marketing group at Accord, and he took it from start up, sound familiar, it's from start up to $60 million company in two years."], ["Another test sentence"], ["Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam"]],
        inputs=[text_in],
        outputs=[score_out, html_out],
        fn=classify,
        cache_examples=True
    )
    gr.Markdown(
        """
        <p style="text-align: center;">
        Call2Vec by X and Y
        <br>
        <img id="visitor-badge" alt="visitor badge" src="https://visitor-badge.glitch.me/badge?page_id=simonschoe.call2vec&left_color=green&right_color=blue" style="display: block; margin-left: auto; margin-right: auto;"/>
        </p>
        """
    )
    # Wire the button to the classifier; same fn/signature as the examples.
    compute_bt.click(classify, inputs=[text_in], outputs=[score_out, html_out])


app.launch()
85
+
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ beautifulsoup4==4.11.1
2
+ transformers_interpret==0.7.2