Sean MacAvaney committed on
Commit
096a82e
1 Parent(s): d40a755

minusminus

Browse files
Files changed (4) hide show
  1. app.py +97 -3
  2. mm.md +19 -0
  3. requirements.txt +3 -1
  4. wrapup.md +3 -2
app.py CHANGED
@@ -1,17 +1,27 @@
 
 
 
1
  import pandas as pd
2
  import gradio as gr
3
- from pyterrier_doc2query import Doc2Query
 
4
  from pyterrier_gradio import Demo, MarkdownFile, interface, df2code, code2md, EX_D
5
 
6
  MODEL = 'macavaney/doc2query-t5-base-msmarco'
 
 
 
7
 
8
  doc2query = Doc2Query(MODEL, append=True, num_samples=5)
 
 
9
 
10
  COLAB_NAME = 'pyterrier_doc2query.ipynb'
11
  COLAB_INSTALL = '''
12
  !pip install -q git+https://github.com/terrier-org/pyterrier
13
  !pip install -q git+https://github.com/terrierteam/pyterrier_doc2query
14
  '''.strip()
 
15
 
16
  def predict(input, model, append, num_samples):
17
  assert model == MODEL
@@ -24,7 +34,68 @@ doc2query = Doc2Query({repr(model)}, append={append}, num_samples={num_samples})
24
 
25
  doc2query({df2code(input)})
26
  '''
27
- return (doc2query(input), code2md(code, COLAB_INSTALL, COLAB_NAME))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  interface(
30
  MarkdownFile('README.md'),
@@ -48,5 +119,28 @@ interface(
48
  label='# Queries'
49
  )],
50
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  MarkdownFile('wrapup.md'),
52
- ).launch(share=False)
 
1
+ import pyterrier as pt
2
+ pt.init()
3
+ import numpy as np
4
  import pandas as pd
5
  import gradio as gr
6
+ from pyterrier_doc2query import Doc2Query, QueryScorer, QueryFilter
7
+ from pyterrier_dr import ElectraScorer
8
  from pyterrier_gradio import Demo, MarkdownFile, interface, df2code, code2md, EX_D
9
 
10
  MODEL = 'macavaney/doc2query-t5-base-msmarco'
11
+ SCORE_MODEL = 'crystina-z/monoELECTRA_LCE_nneg31'
12
# Score boundaries used by generate_vis to bucket a query score via
# np.searchsorted. 19 boundaries yield bucket indices 0..19, each displayed
# as `bucket * 5` (0th..95th percentile).
# NOTE(review): presumably these are the 5%..95% percentiles of observed
# scorer outputs -- confirm where they were measured.
PERCENTILES_BY_5 = np.array([
    -3.80468750e+00, -2.21679688e+00, -1.25683594e+00, -5.58105469e-01,
    -7.65323639e-04, 4.69482422e-01, 8.83300781e-01, 1.25878906e+00,
    1.61035156e+00, 1.94335938e+00, 2.26562500e+00, 2.58007812e+00,
    2.89648438e+00, 3.21484375e+00, 3.54687500e+00, 3.90039062e+00,
    4.30078125e+00, 4.77343750e+00, 5.37109375e+00,
])
# One badge colour per bucket (len(PERCENTILES_BY_5) + 1 entries), running
# from red (lowest scores) through white to green (highest scores).
# Channel values are kept within the valid 0-255 range.
COLORS = [
    'rgb(252, 132, 100)',
    'rgb(252, 148, 116)',
    'rgb(252, 166, 137)',
    'rgb(252, 183, 156)',
    'rgb(253, 200, 178)',
    'rgb(254, 215, 198)',
    'rgb(255, 228, 216)',
    'rgb(255, 237, 228)',
    'rgb(255, 245, 240)',
    'rgb(255, 255, 255)',
    'rgb(247, 252, 245)',
    'rgb(240, 250, 237)',
    'rgb(233, 247, 228)',
    'rgb(222, 242, 216)',
    'rgb(209, 237, 203)',
    'rgb(195, 232, 188)',
    'rgb(180, 225, 173)',
    'rgb(163, 218, 157)',
    'rgb(145, 210, 142)',
    'rgb(125, 201, 126)',
]
14
 
15
  doc2query = Doc2Query(MODEL, append=True, num_samples=5)
16
+ electra = ElectraScorer()
17
+ query_scorer = QueryScorer(electra)
18
 
19
  COLAB_NAME = 'pyterrier_doc2query.ipynb'
20
  COLAB_INSTALL = '''
21
  !pip install -q git+https://github.com/terrier-org/pyterrier
22
  !pip install -q git+https://github.com/terrierteam/pyterrier_doc2query
23
  '''.strip()
24
+ COLAB_INSTALL_MM = COLAB_INSTALL + '\n!pip install -q git+https://github.com/terrierteam/pyterrier_dr faiss-cpu'
25
 
26
  def predict(input, model, append, num_samples):
27
  assert model == MODEL
 
34
 
35
  doc2query({df2code(input)})
36
  '''
37
+ res = doc2query(input)
38
+ vis = generate_vis(res)
39
+ return (doc2query(input), code2md(code, COLAB_INSTALL, COLAB_NAME), vis)
40
+
41
def generate_vis(df):
    """Render Doc2Query output as an HTML fragment, one bordered card per row.

    Every row of ``df`` must expose ``docno`` and ``text``. If the frame has a
    ``querygen`` column, its newline-separated queries are listed under the
    document text. If it also has ``querygen_score``, each query is prefixed
    with a badge showing the bucket its score falls into (looked up in
    ``PERCENTILES_BY_5`` and shown as ``bucket * 5``), coloured with the
    matching ``COLORS`` entry.

    Document ids, document text and queries come from user input or model
    output, so they are HTML-escaped before being placed in the markup.

    Returns:
        A single string with the cards joined by newlines.
    """
    import html  # function-scope import keeps this block self-contained

    cards = []
    for row in df.itertuples(index=False):
        query_divs = []
        if hasattr(row, 'querygen_score'):
            # Queries and scores are parallel sequences; pair them up.
            for q, score in zip(row.querygen.split('\n'), row.querygen_score):
                bucket = np.searchsorted(PERCENTILES_BY_5, score)
                color = COLORS[bucket]
                percentile = bucket * 5
                query_divs.append(
                    '\n<div>\n'
                    f'<span title="score={score:.4f}, in the {percentile}th percentile of scores" '
                    'style="border: 1px solid #888; border-radius: 3px; font-size: 0.6em; '
                    f'font-family: monospace; background-color: {color}; padding: 1px 3px;">'
                    f'{percentile}th</span> {html.escape(q)}\n'
                    '</div>\n'
                )
        elif hasattr(row, 'querygen'):
            for q in row.querygen.split('\n'):
                query_divs.append(f'\n<div>{html.escape(q)}</div>\n')

        queries_html = '\n'.join(query_divs)
        if queries_html:
            queries_html = (
                '\n<div><strong>Expansion Queries:</strong></div>\n'
                f'{queries_html}\n'
            )

        # Escape first, then turn line breaks into <br/> so the tags we add
        # ourselves are not escaped.
        text = html.escape(row.text).replace('\n', '<br/>')
        docno = html.escape(str(row.docno))
        cards.append(
            f'\n<div style="font-size: 1.2em;">Document: <strong>{docno}</strong></div>\n'
            '<div style="margin: 4px 0 16px; padding: 4px; border: 1px solid black;">\n'
            '<div>\n'
            f'{text}\n'
            '</div>\n'
            f'{queries_html}\n'
            '</div>\n'
        )
    return '\n'.join(cards)
77
+
78
def predict_mm(input, model, num_samples, score_model):
    """Demo callback: generate queries for the input documents and score them.

    Args:
        input: document frame handed over by the demo UI.
        model: generator model name; must equal ``MODEL``.
        num_samples: number of queries to generate per document.
        score_model: scorer model name; must equal ``SCORE_MODEL``.

    Returns:
        A 3-tuple of (result frame with ``querygen_score`` rendered as a
        bracketed string, markdown for the equivalent code snippet, HTML
        visualisation of the scored queries).
    """
    assert model == MODEL
    assert score_model == SCORE_MODEL

    # The module-level generator is shared, so it is reconfigured per request.
    doc2query.append = False
    doc2query.num_samples = num_samples
    scoring_pipeline = doc2query >> query_scorer

    snippet = f'''import pyterrier as pt ; pt.init()
import pandas as pd
from pyterrier_doc2query import Doc2Query, QueryScorer
from pyterrier_dr import ElectraScorer

doc2query = Doc2Query({repr(model)}, append=False, num_samples={num_samples})
scorer = ElectraScorer({repr(score_model)})
pipeline = doc2query >> QueryScorer(scorer)

pipeline({df2code(input)})
'''

    scored = scoring_pipeline(input)
    # Build the visualisation while the scores are still numeric; they are
    # turned into display strings just below.
    markup = generate_vis(scored)

    def _scores_to_text(scores):
        return '[ ' + ', '.join(map(str, scores)) + ' ]'

    scored['querygen_score'] = scored['querygen_score'].apply(_scores_to_text)
    return (scored, code2md(snippet, COLAB_INSTALL_MM, COLAB_NAME), markup)
99
 
100
  interface(
101
  MarkdownFile('README.md'),
 
119
  label='# Queries'
120
  )],
121
  ),
122
+ MarkdownFile('mm.md'),
123
+ Demo(
124
+ predict_mm,
125
+ EX_D,
126
+ [
127
+ gr.Dropdown(
128
+ choices=[MODEL],
129
+ value=MODEL,
130
+ label='Model',
131
+ interactive=False,
132
+ ), gr.Slider(
133
+ minimum=1,
134
+ maximum=10,
135
+ value=doc2query.num_samples,
136
+ step=1.,
137
+ label='# Queries'
138
+ ), gr.Dropdown(
139
+ choices=[SCORE_MODEL],
140
+ value=SCORE_MODEL,
141
+ label='Filter',
142
+ interactive=False,
143
+ )],
144
+ ),
145
  MarkdownFile('wrapup.md'),
146
+ ).launch(share=True)
mm.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Doc2Query&minus;&minus;: When Less is More
2
+
3
+ You might notice that not all the generated queries look related to the source text. This is due
4
+ to a defect that can appear in sequence-to-sequence models known as "[hallucination](https://aclanthology.org/2020.acl-main.173/)".
5
+
6
+ Doc2Query&minus;&minus; can filter out these low-quality queries by measuring the relevance between them and the text that
7
+ generated them using a scoring transformer `S`. It is applied as two transformers that follow the Doc2Query generator:
8
+
9
+ <div class="pipeline">
10
+ <div class="df" title="Document Frame">D</div>
11
+ <div class="transformer" title="Doc2Query Transformer">Doc2Query</div>
12
+ <div class="df" title="Document Frame">D</div>
13
+ <div class="transformer attn" title="QueryScorer Transformer">QueryScorer
14
+ <div class="artefact" title="Scorer Transformer">S</div>
15
+ </div>
16
+ <div class="df" title="Document Frame">D</div>
17
+ <div class="transformer attn" title="QueryFilter Transformer">QueryFilter</div>
18
+ <div class="df" title="Document Frame">D</div>
19
+ </div>
requirements.txt CHANGED
@@ -1,5 +1,7 @@
1
  git+https://github.com/seanmacavaney/[email protected]
2
  git+https://github.com/terrier-org/pyterrier
3
- git+https://github.com/terrierteam/pyterrier_doc2query@master
 
4
  ir_datasets
5
  ir_measures
 
 
1
  git+https://github.com/seanmacavaney/[email protected]
2
  git+https://github.com/terrier-org/pyterrier
3
+ git+https://github.com/terrierteam/pyterrier_doc2query@minusminus
4
+ git+https://github.com/terrierteam/pyterrier_dr
5
  ir_datasets
6
  ir_measures
7
+ faiss-cpu
wrapup.md CHANGED
@@ -1,10 +1,10 @@
1
  ### Putting it all together
2
 
3
- You can use Doc2Query in an indexing pipeline to build an index of the expanded documents:
4
 
5
  <div class="pipeline">
6
  <div class="df" title="Document Frame">D</div>
7
- <div class="transformer attn" title="Doc2Query Transformer">Doc2Query</div>
8
  <div class="df" title="Document Frame">D</div>
9
  <div class="transformer" title="Indexer">Indexer</div>
10
  <div class="artefact" title="Doc2Query Index">IDX</div>
@@ -39,4 +39,5 @@ bm25 = pt.BatchRetrieve('./msmarco_psg', wmodel="BM25")
39
  ### References & Credits
40
 
41
  - Rodrigo Nogueira and Jimmy Lin. [From doc2query to docTTTTTquery](https://cs.uwaterloo.ca/~jimmylin/publications/Nogueira_Lin_2019_docTTTTTquery-v2.pdf).
 
42
  - Craig Macdonald, Nicola Tonellotto, Sean MacAvaney, Iadh Ounis. [PyTerrier: Declarative Experimentation in Python from BM25 to Dense Retrieval](https://dl.acm.org/doi/abs/10.1145/3459637.3482013). CIKM 2021.
 
1
  ### Putting it all together
2
 
3
+ You can use Doc2Query or Doc2Query&minus;&minus; in an indexing pipeline to build an index of the expanded documents:
4
 
5
  <div class="pipeline">
6
  <div class="df" title="Document Frame">D</div>
7
+ <div class="transformer attn" title="Doc2Query or Doc2Query&minus;&minus; Transformer">Doc2Query[&minus;&minus;]</div>
8
  <div class="df" title="Document Frame">D</div>
9
  <div class="transformer" title="Indexer">Indexer</div>
10
  <div class="artefact" title="Doc2Query Index">IDX</div>
 
39
  ### References & Credits
40
 
41
  - Rodrigo Nogueira and Jimmy Lin. [From doc2query to docTTTTTquery](https://cs.uwaterloo.ca/~jimmylin/publications/Nogueira_Lin_2019_docTTTTTquery-v2.pdf).
42
+ - Mitko Gospodinov, Sean MacAvaney, and Craig Macdonald. Doc2Query&minus;&minus;: When Less is More. ECIR 2023.
43
  - Craig Macdonald, Nicola Tonellotto, Sean MacAvaney, Iadh Ounis. [PyTerrier: Declarative Experimentation in Python from BM25 to Dense Retrieval](https://dl.acm.org/doi/abs/10.1145/3459637.3482013). CIKM 2021.