import gradio as gr
from transformers import pipeline
from transformers_interpret import SequenceClassificationExplainer
from bs4 import BeautifulSoup


# Set up the classification pipeline and the word-importance explainer
classifier = pipeline("text-classification", model="simonschoe/TransformationTransformer")
explainer = SequenceClassificationExplainer(classifier.model, classifier.tokenizer)

legend = """
<div style="text-align: center; display: block; margin-left: auto; margin-right: auto; border-top: 1px solid; margin-top: 5px; padding-top: 5px;"><b>Legend: </b><span style="display: inline-block; width: 10px; height: 10px; border: 1px solid; background-color: hsl(0, 75%, 60%)"></span> Generic  <span style="display: inline-block; width: 10px; height: 10px; border: 1px solid; background-color: hsl(120, 75%, 50%)"></span> Transformation  </div>
"""

def classify(_input):
    """
    Compute the LABEL_1 probability and an HTML word-importance explanation for the given input text.
    """
    result = classifier(_input)[0]
    score = result['score']
    if result['label'] == 'LABEL_0':
        # the pipeline returns the probability of the predicted label; convert it to the LABEL_1 probability
        score = 1 - score

    # run attribution; visualize() renders the attributions stored by this call
    attributions = explainer(_input)
    html = explainer.visualize().__html__()

    # keep only the last table cell (the word-importance heatmap) and re-wrap it as a <div>
    soup = BeautifulSoup(html, 'html.parser')
    explanation = str(soup.find_all('td')[-1]).replace('td', 'div')
    # append the color legend below the word-importance explanation
    result_html = explanation + legend
    return score, result_html
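
# Minimal manual check of the wrapper above (the sample sentence is hypothetical and purely
# illustrative; this is not part of the app flow and is only meant for local debugging):
#
#     probability, explanation_html = classify("We are shifting our portfolio towards recurring software revenue.")
#     print(round(probability, 3))   # LABEL_1 probability
#     print(explanation_html[:200])  # start of the word-importance HTML shown in the app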

app = gr.Blocks()

with app:
    gr.Markdown("# Call2Vec")
    gr.Markdown("## Semantic Search in Quarterly Earnings Conference Calls")
    with gr.Row():
        with gr.Column():
            text_in = gr.Textbox(lines=1, placeholder="Insert text", label="Search Query")
            with gr.Row():
                compute_bt = gr.Button("Calculate")
            score_out = gr.Number(label="Label 1 probability", interactive=False)
            html_out = gr.HTML(label="Explanation")
        with gr.Column():
            gr.Markdown(
                """
                #### Project Description
                Call2Vec is a [fastText](https://fasttext.cc/) word embedding model trained via [Gensim](https://radimrehurek.com/gensim/). It maps each token in the vocabulary into a dense, 300-dimensional vector space, designed for performing semantic search.
                The model is trained on a large sample of quarterly earnings conference calls held by U.S. firms during the 2006-2022 period. In particular, the training data is restricted to the (rather spontaneous) executives' remarks in the Q&A section of the call. The data has been preprocessed prior to model training via stop word removal, lemmatization, named entity masking, and co-occurrence modeling.
                """
            )
            gr.Markdown(
                """
                #### App usage
                The model is intended to be used for **semantic search**: it encodes the search query (entered in the search textbox) in a dense vector space and finds semantic neighbours, i.e., tokens that frequently occur within similar contexts in the underlying training data.
                The model allows for two use cases:
                1. *Single Search:* The input query consists of a single word. When provided with a bi-, tri-, or even four-gram, the quality of the model output depends on the presence of the query token in the model's vocabulary. N-grams should be concatenated with an underscore (e.g., "machine_learning" or "artificial_intelligence").
                2. *Multi Search:* The input query may consist of several words or n-grams, separated by comma, semicolon, or newline. The model then computes the average vector over all inputs and performs semantic search based on this averaged vector.
                """
            )
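            # A minimal sketch of the multi-search averaging described in the markdown above,
            # assuming a gensim KeyedVectors model named `kv` (hypothetical; this classification
            # app does not load such a model):
            #
            #     import numpy as np
            #     queries = ["machine_learning", "artificial_intelligence"]
            #     mean_vec = np.mean([kv.get_vector(q) for q in queries], axis=0)
            #     neighbours = kv.similar_by_vector(mean_vec, topn=10)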
            gr.Examples(
                examples=[["Now Accord networks is a company in video, and he led the sales team, and the marketing group at Accord, and he took it from start up, sound familiar, it's from start up to $60 million company in two years."], ["Another test sentence"], ["Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam"]],
                inputs=[text_in],
                outputs=[score_out, html_out],
                fn=classify,
                cache_examples=True
            )
    gr.Markdown(
        """
        <p style="text-align: center;">
            Call2Vec by X and Y
            <br>
            <img id="visitor-badge" alt="visitor badge" src="https://visitor-badge.glitch.me/badge?page_id=simonschoe.call2vec&left_color=green&right_color=blue" style="display: block; margin-left: auto; margin-right: auto;"/>
        </p>
        """
    )
    compute_bt.click(classify, inputs=[text_in], outputs=[score_out, html_out])


app.launch()