import gradio as gr
from transformers import pipeline
from transformers_interpret import SequenceClassificationExplainer
from bs4 import BeautifulSoup
# Set up the classification pipeline and the attribution explainer
classifier = pipeline("text-classification", model="simonschoe/TransformationTransformer")
explainer = SequenceClassificationExplainer(classifier.model, classifier.tokenizer)
legend = """
Legend: Generic Transformation
"""
def classify(_input):
    """
    Wrapper that computes the label-1 probability and a word-importance
    explanation for the given input text.
    """
    result = classifier(_input)[0]
    score = result['score']
    if result['label'] == 'LABEL_0':
        # The pipeline reports the winning label's probability; convert it
        # to the label-1 probability when label 0 wins.
        score = 1 - score
    # Compute token attributions; this call populates the explainer's state
    # so that visualize() can render them.
    explainer(_input)
    html = explainer.visualize().__html__()
    # The visualization is an HTML table; keep only its last cell (the word
    # importance heat map) and turn it into a standalone <div>.
    soup = BeautifulSoup(html, 'html.parser')
    explanation = str(soup.find_all('td')[-1]).replace('td', 'div')
    # Append the legend to the word-importance explanation
    result_html = explanation + legend
    return score, result_html
app = gr.Blocks()
with app:
    gr.Markdown("# Call2Vec")
    gr.Markdown("## Semantic Search in Quarterly Earnings Conference Calls")
    with gr.Row():
        with gr.Column():
            text_in = gr.Textbox(lines=1, placeholder="Insert text", label="Search Query")
            with gr.Row():
                compute_bt = gr.Button("Calculate")
            score_out = gr.Number(label="Label 1 probability", interactive=False)
            html_out = gr.HTML(label="Explanation")
        with gr.Column():
            gr.Markdown(
                """
                #### Project Description
                Call2Vec is a [fastText](https://fasttext.cc/) word embedding model trained via [Gensim](https://radimrehurek.com/gensim/). It maps each token in the vocabulary into a dense, 300-dimensional vector space designed for performing semantic search.
                The model is trained on a large sample of quarterly earnings conference calls held by U.S. firms during the 2006-2022 period. In particular, the training data is restricted to the (rather spontaneous) executives' remarks in the Q&A section of the call. Prior to model training, the data has been preprocessed via stop word removal, lemmatization, named entity masking, and co-occurrence modeling.
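                Under the hood, neighbours in such an embedding space can be queried with a few lines of Gensim code. A minimal sketch (the file name `call2vec.bin` and the token `digital_transformation` are placeholders, not the actual artifacts):
                ```python
                from gensim.models import fasttext

                # Load a fastText model in Facebook's binary format (placeholder file name)
                model = fasttext.load_facebook_model("call2vec.bin")

                # Every vocabulary token maps to a dense 300-dimensional vector
                vector = model.wv["digital_transformation"]  # shape: (300,)

                # Nearest neighbours by cosine similarity
                print(model.wv.most_similar("digital_transformation", topn=10))
                ```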
"""
)
gr.Markdown(
"""
#### App usage
The model is intented to be used for **semantic search**: It encodes the search query (entered in the textbox on the right) in a dense vector space and finds semantic neighbours, i.e., token which frequently occur within similar contexts in the underlying training data.
The model allows for two use cases:
1. *Single Search:* The input query consists of a single word. When provided a bi-, tri-, or even fourgram, the quality of the model output depends on the presence of the query token in the model's vocabulary. N-grams should be concated by an underscore (e.g., "machine_learning" or "artifical_intelligence").
2. *Multi Search:* The input query may consist of several words or n-grams, seperated by comma, semi-colon or newline. It then computes the average vector over all inputs and performs semantic search based on the average input token.
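                For illustration, the multi-search averaging can be reproduced with Gensim's `most_similar`, which averages the vectors of all `positive` tokens before ranking neighbours (file and token names below are placeholders):
                ```python
                from gensim.models import fasttext

                model = fasttext.load_facebook_model("call2vec.bin")  # placeholder file name

                # Gensim averages the vectors of all `positive` tokens, then ranks
                # the vocabulary by cosine similarity to that mean vector
                neighbours = model.wv.most_similar(
                    positive=["machine_learning", "cloud_computing"], topn=10
                )
                ```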
"""
)
gr.Examples(
examples=[["Now Accord networks is a company in video, and he led the sales team, and the marketing group at Accord, and he took it from start up, sound familiar, it's from start up to $60 million company in two years."], ["Another test sentence"], ["Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam"]],
inputs=[text_in],
outputs=[score_out, html_out],
fn=classify,
cache_examples=True
)
gr.Markdown(
"""
Call2Vec by X and Y
"""
)
compute_bt.click(classify, inputs=[text_in], outputs=[score_out, html_out])
app.launch()