|
import streamlit as st |
|
|
|
from transformers import MarkupLMProcessor, MarkupLMForQuestionAnswering |
|
|
|
import requests |
|
from bs4 import BeautifulSoup |
|
|
|
import numpy as np |
|
|
|
import torch |
|
import torch.nn.functional as F |
|
|
|
|
|
# Tokenizer window size (the model's max sequence length) and the token
# overlap between consecutive overflow windows when the page exceeds MAX_LEN.
MAX_LEN = 512

STRIDE = 100


# Reject predicted spans longer than this many tokens, or whose joint
# start*end probability does not exceed this threshold.
MAX_ANSWER_LEN = 30

MIN_CONFIDENCE = 0.9


# MarkupLM checkpoint fine-tuned for extractive QA over HTML (WebSRC dataset).
MODEL_STR = "microsoft/markuplm-base-finetuned-websrc"
|
|
|
|
|
# Load the HTML-aware processor (feature extraction + tokenization) and the
# QA head; downloads the checkpoint from the Hugging Face hub on first run.
processor = MarkupLMProcessor.from_pretrained(MODEL_STR)

model = MarkupLMForQuestionAnswering.from_pretrained(MODEL_STR)

# Browser-like User-Agent so sites that block the default python-requests
# client (e.g. Yahoo Finance) serve the real page.
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
}
|
|
|
|
|
# URL of the page to scrape; defaults to a quote page with plenty of
# labeled numeric facts for the model to extract.
input_url = st.text_input(
    label="Enter url of page to scrape",
    value="https://finance.yahoo.com/quote/META/",
    key="url",
)

# Natural-language question answered against the scraped HTML.
input_question = st.text_input(
    label="Enter Question",
    value="What is the market capitalization?",
    key="question",
)
|
|
|
st.write("Getting html page ...")


# Fetch the page with a bounded timeout (a bare requests.get can hang
# forever) and fail fast on HTTP error statuses instead of silently feeding
# an error page to the model.
page = requests.get(input_url, headers=headers, timeout=30)
page.raise_for_status()


soup = BeautifulSoup(page.content, "html.parser")


# Restrict to <body> when present; fall back to the whole document so a
# body-less page does not become the literal string "None".
body = soup.find('body')

html_string = str(body if body is not None else soup)

# Surface the page size in the UI — the original bare `len(html_string)`
# expression was a no-op outside a notebook.
st.write(f"Fetched {len(html_string):,} characters of html")
|
|
|
|
|
# Tokenize question + page HTML as a batch of overlapping windows: the
# question is never truncated ("only_second"); the HTML overflows into extra
# segments of MAX_LEN tokens overlapping by STRIDE so no content is dropped.
encoding = processor(html_string, questions=input_question, return_tensors="pt", truncation="only_second",
                     stride=STRIDE, max_length=MAX_LEN, return_overflowing_tokens=True, padding=True)


# The model's forward() does not accept this bookkeeping tensor, so drop it
# before unpacking the encoding with model(**encoding).
del encoding['overflow_to_sample_mapping']
# Zero out the token_type_ids — presumably works around a segment-id mismatch
# between the processor output and what this checkpoint expects;
# TODO(review): confirm against the model card.
encoding['token_type_ids'] = encoding['token_type_ids'].fill_(0)


# One batch row per overflow window.
n_segments = encoding['input_ids'].shape[0]
# Index of the first '</s>' separator in window 0; tokens at or before this
# position belong to the question, everything after is page content. The
# question part is never truncated, so this index holds for every window.
question_index = encoding[0].tokens.index('</s>')
|
|
|
|
|
# Run every window through the model in a single forward pass, with
# autograd disabled since we only need predictions.
with torch.no_grad():
    outputs = model(**encoding)


# Per-window probability distributions over token positions for the
# answer's start and end (softmax over the sequence axis).
start_probs = torch.softmax(outputs.start_logits, dim=-1).numpy()
end_probs = torch.softmax(outputs.end_logits, dim=-1).numpy()
|
|
|
|
|
answers = []

# Scan each overflow window for its single best candidate span.
for i in range(n_segments):

    # Most likely start/end positions and their joint probability.
    start_index = int(np.argmax(start_probs[i]))
    end_index = int(np.argmax(end_probs[i]))
    confidence = start_probs[i].max() * end_probs[i].max()

    # Keep only spans that are well-formed (end after start), short enough,
    # land in the HTML portion of the input (strictly after the question
    # separator), and clear the confidence threshold.
    if (
        end_index > start_index
        and end_index - start_index <= MAX_ANSWER_LEN
        and start_index > question_index
        and end_index > question_index
        and confidence > MIN_CONFIDENCE
    ):

        # BUG FIX: slice segment i, not segment 0 — the original always
        # decoded from the first window's input_ids, so spans found in
        # later overflow windows were decoded from the wrong tokens.
        predict_answer_tokens = encoding.input_ids[i, start_index : end_index + 1]
        answer = processor.decode(predict_answer_tokens, skip_special_tokens=True)

        answers.append({"answer": answer, "confidence": confidence})
|
|
|
|
|
# Surface every accepted span (answer text + confidence) in the UI.
for candidate in answers:
    st.write(candidate)


st.write("Done!")