# doc2query / app.py
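# Gradio demo for pyterrier_doc2query. The first demo generates Doc2Query expansion
# queries for input documents (optionally appending them to the text); the second
# demo additionally scores the generated queries with a monoELECTRA cross-encoder
# and filters out low-scoring ones (the query-filtering setup described in mm.md).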
import pyterrier as pt
pt.init()
import numpy as np
import pandas as pd
import gradio as gr
from pyterrier_doc2query import Doc2Query, QueryScorer, QueryFilter
from pyterrier_dr import ElectraScorer
from pyterrier_gradio import Demo, MarkdownFile, interface, df2code, code2md, EX_D
MODEL = 'macavaney/doc2query-t5-base-msmarco'
SCORE_MODEL = 'crystina-z/monoELECTRA_LCE_nneg31'
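# PERCENTILES_BY_5 holds the 5th..95th percentiles of monoELECTRA scores for
# generated queries (computed over MS MARCO, per the note emitted in the Colab
# snippet below); COLORS maps each 5-percentile bucket to a red-to-white-to-green
# background colour used in the HTML visualisation.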
PERCENTILES_BY_5 = np.array([-3.80468750e+00, -2.21679688e+00, -1.25683594e+00, -5.58105469e-01, -7.65323639e-04, 4.69482422e-01, 8.83300781e-01, 1.25878906e+00, 1.61035156e+00, 1.94335938e+00, 2.26562500e+00, 2.58007812e+00, 2.89648438e+00, 3.21484375e+00, 3.54687500e+00, 3.90039062e+00, 4.30078125e+00, 4.77343750e+00, 5.37109375e+00])
COLORS = ['rgb(252, 132, 100)','rgb(252, 148, 116)','rgb(252, 166, 137)','rgb(252, 183, 156)','rgb(253, 200, 178)','rgb(254, 215, 198)','rgb(255, 228, 216)','rgb(255, 237, 228)','rgb(255, 245, 240)','rgb(255, 255, 255)','rgb(247, 252, 245)','rgb(240, 250, 237)','rgb(233, 247, 228)','rgb(222, 242, 216)','rgb(209, 237, 203)','rgb(195, 232, 188)','rgb(180, 225, 173)','rgb(163, 218, 157)','rgb(145, 210, 142)','rgb(125, 201, 126)']
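# Shared pipeline components: the Doc2Query generator, the ELECTRA cross-encoder
# used to score generated queries, and the filter that drops queries scoring below
# a threshold. append, num_samples, and t are overridden per request below.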
doc2query = Doc2Query(MODEL, append=True, num_samples=5)
electra = ElectraScorer()
query_scorer = QueryScorer(electra)
query_filter = QueryFilter(t=0, append=False)
COLAB_NAME = 'pyterrier_doc2query.ipynb'
COLAB_INSTALL = '''
!pip install -q git+https://github.com/terrier-org/pyterrier
!pip install -q git+https://github.com/terrierteam/pyterrier_doc2query
'''.strip()
COLAB_INSTALL_MM = COLAB_INSTALL + '\n!pip install -q git+https://github.com/terrierteam/pyterrier_dr faiss-cpu'
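# First demo: run Doc2Query alone, optionally appending the generated queries to the
# document text; returns the results, a reproduction code snippet, and an HTML visualisation.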
def predict(input, model, append, num_samples):
assert model == MODEL
doc2query.append = append
doc2query.num_samples = num_samples
code = f'''import pandas as pd
from pyterrier_doc2query import Doc2Query
doc2query = Doc2Query({repr(model)}, append={append}, num_samples={num_samples})
doc2query({df2code(input)})
'''
res = doc2query(input)
vis = generate_vis(res)
    return (res, code2md(code, COLAB_INSTALL, COLAB_NAME), vis)
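# Build an HTML rendering of each document: the document text followed by its
# generated expansion queries, each tagged with its score percentile (colour-coded
# via PERCENTILES_BY_5 / COLORS) when scores are available.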
def generate_vis(df):
result = []
for row in df.itertuples(index=False):
qs = []
if hasattr(row, 'querygen_score'):
for q, score in zip(row.querygen.split('\n'), row.querygen_score):
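                # map the raw ELECTRA score to its 5-percentile bucket; bucket * 5 gives
                # the percentile label shown in the tooltip and picks the badge colour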
bucket = np.searchsorted(PERCENTILES_BY_5, score)
color = COLORS[bucket]
percentile = bucket * 5
qs.append(f'''
<div>
<span title="score={score:.4f}, in the {percentile}th percentile of scores" style="border: 1px solid #888; border-radius: 3px; font-size: 0.6em; font-family: monospace; background-color: {color}; padding: 1px 3px;">{percentile}th</span> {q}
</div>
''')
elif hasattr(row, 'querygen'):
for q in row.querygen.split('\n'):
qs.append(f'''
<div>{q}</div>
''')
qs = '\n'.join(qs)
if qs:
qs = f'''
<div><strong>Expansion Queries:</strong></div>
{qs}
'''
text = row.text.replace('\n', '<br/>')
result.append(f'''
<div style="font-size: 1.2em;">Document: <strong>{row.docno}</strong></div>
<div style="margin: 4px 0 16px; padding: 4px; border: 1px solid black;">
<div>
{text}
</div>
{qs}
</div>
''')
return '\n'.join(result)
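# Second demo: generate queries without appending, score them with monoELECTRA, and
# (if a filter percentage is selected) drop queries scoring below that percentile;
# returns the results, a reproduction code snippet, and the visualisation.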
def predict_mm(input, model, num_samples, score_model, filter_pct):
assert model == MODEL
assert score_model == SCORE_MODEL
doc2query.append = False
doc2query.num_samples = num_samples
if filter_pct > 0:
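        # translate the requested percentage into a score threshold,
        # e.g. 10 selects the 10th-percentile score from PERCENTILES_BY_5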
query_filter.t = PERCENTILES_BY_5[filter_pct//5-1]
pipeline = doc2query >> query_scorer >> query_filter
code = f'''import pyterrier as pt ; pt.init()
import pandas as pd
from pyterrier_doc2query import Doc2Query, QueryScorer, QueryFilter
from pyterrier_dr import ElectraScorer
doc2query = Doc2Query({repr(model)}, append=False, num_samples={num_samples})
scorer = ElectraScorer({repr(score_model)})
pipeline = doc2query >> QueryScorer(scorer) >> QueryFilter(append=False, t={query_filter.t})
# use append=True when indexing; t={query_filter.t} is the {filter_pct}th percentile for generated queries on MS MARCO
pipeline({df2code(input)})
'''
else:
pipeline = doc2query >> query_scorer
code = f'''import pyterrier as pt ; pt.init()
import pandas as pd
from pyterrier_doc2query import Doc2Query, QueryScorer
from pyterrier_dr import ElectraScorer
doc2query = Doc2Query({repr(model)}, append=False, num_samples={num_samples})
scorer = ElectraScorer({repr(score_model)})
pipeline = doc2query >> QueryScorer(scorer)
pipeline({df2code(input)})
'''
res = pipeline(input)
vis = generate_vis(res)
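    # render the per-query score arrays as strings so they display cleanly in the results table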
res['querygen_score'] = res['querygen_score'].apply(lambda x: '[ ' + ', '.join(str(v) for v in x) + ' ]')
return (res, code2md(code, COLAB_INSTALL_MM, COLAB_NAME), vis)
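# Page layout: README, the plain Doc2Query demo, the scoring/filtering write-up
# (mm.md), the scoring/filtering demo, and a wrap-up section.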
interface(
MarkdownFile('README.md'),
Demo(
predict,
EX_D,
[
gr.Dropdown(
choices=[MODEL],
value=MODEL,
label='Model',
interactive=False,
), gr.Checkbox(
value=doc2query.append,
label="Append",
), gr.Slider(
minimum=1,
maximum=10,
value=doc2query.num_samples,
step=1.,
label='# Queries'
)],
),
MarkdownFile('mm.md'),
Demo(
predict_mm,
EX_D,
[
gr.Dropdown(
choices=[MODEL],
value=MODEL,
label='Model',
interactive=False,
), gr.Slider(
minimum=1,
maximum=10,
value=doc2query.num_samples,
step=1.,
label='# Queries'
), gr.Dropdown(
choices=[SCORE_MODEL],
value=SCORE_MODEL,
label='Scorer',
interactive=False,
), gr.Slider(
minimum=0,
maximum=95,
value=10,
step=5,
                label='Filter (remove bottom % of queries by score)'
)],
),
MarkdownFile('wrapup.md'),
).launch(share=False)