import torch
from tqdm import tqdm
from transformers import BertTokenizerFast

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
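
# NOTE (assumption): the .pt files loaded below appear to be whole pickled
# nn.Module objects, so torch.load needs their defining class importable at
# load time (and, on PyTorch >= 2.6, weights_only=False). The class below is
# only a plausible sketch of such a single encoder tower -- BertModel with CLS
# pooling -- not the actual saved architecture; the name `BertEncoder` is
# hypothetical, and unpickling will still resolve to the original class name.
from torch import nn
from transformers import BertModel

class BertEncoder(nn.Module):
    """Hypothetical encoder tower: BERT followed by CLS-token pooling."""
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")

    def forward(self, input_ids, attention_mask):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return out.last_hidden_state[:, 0]  # one fixed-size vector per sequence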
class DualBertModel:
    """
    Dual-encoder wrapper: one BERT model encodes queries, another encodes
    documents (passages). Use this class to encode your documents and queries.
    """
    def __init__(self, **kwargs):
        """
        Loads the models saved in "model/passage_model_3.pt" and
        "model/query_model_3.pt" and puts them into eval mode.
        """
        self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case=True)
        # torch.load restores full pickled modules; map them onto the active
        # device and switch to eval mode so dropout behaves deterministically.
        # (On PyTorch >= 2.6 this also needs weights_only=False.)
        self.p_model = torch.load("model/passage_model_3.pt", map_location=device).eval()
        self.q_model = torch.load("model/query_model_3.pt", map_location=device).eval()
    def chunks(self, lst, n):
        """Yield successive n-sized chunks of a list of strings, with a progress bar."""
        for i in tqdm(range(0, len(lst), n)):
            yield lst[i:i + n]
    def process(self, lst, query):
        """Tokenize a batch of strings and return (input_ids, attention_mask)."""
        # Queries are short, so a tighter length budget keeps padding cheap;
        # passages get a longer one.
        max_length = 30 if query else 450
        encoded = self.tokenizer(
            text=lst,
            add_special_tokens=True,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            max_length=max_length
        )
        input_ids = torch.tensor(encoded['input_ids']).to(device)
        attention_mask = torch.tensor(encoded['attention_mask']).to(device)
        return input_ids, attention_mask
    def encode_queries(self, queries, batch_size: int, **kwargs):
        """
        Encodes a list of query strings and returns a single tensor of
        embeddings, one row per query.
        """
        with torch.no_grad():
            to_return = []
            for c in self.chunks(queries, batch_size):
                ids, atm = self.process(c, query=True)
                out = self.q_model(ids, atm)
                to_return.append(out.cpu())
            return torch.cat(to_return)
    def encode_corpus(self, corpus, batch_size: int, **kwargs):
        """
        Encodes a list of document strings and returns a single tensor of
        embeddings, one row per document.
        """
        with torch.no_grad():
            to_return = []
            for c in self.chunks(corpus, batch_size):
                # If the corpus arrives as BEIR-style dicts, extract the text first:
                # c = [d['text'] for d in c]
                ids, atm = self.process(c, query=False)
                out = self.p_model(ids, atm)
                to_return.append(out.cpu())
            return torch.cat(to_return)
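
# A small convenience on top of the encoders: rank documents for each query by
# Euclidean distance, matching the torch.cdist call in the demo below. This
# helper is a sketch added for illustration (the name `rank_documents` is not
# from the original code); smaller distance means a closer match.
def rank_documents(query_embs, doc_embs, k=10):
    """Return (distances, indices) of the k closest documents per query."""
    dists = torch.cdist(query_embs, doc_embs)          # (num_queries, num_docs)
    k = min(k, doc_embs.shape[0])                      # never ask for more docs than exist
    return torch.topk(dists, k, dim=1, largest=False)  # smallest distances first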
if __name__ == "__main__":
    model = DualBertModel()
    queries = ["What is deep learning"]
    documents = ['The adjective "deep" in deep learning refers to the use of multiple layers in the network. Early work showed that a linear perceptron cannot be a universal classifier, but that a network with a nonpolynomial activation function with one hidden layer of unbounded width can. Deep learning is a modern variation which is concerned with an unbounded number of layers of bounded size, which permits practical application and optimized implementation, while retaining theoretical universality under mild conditions. In deep learning the layers are also permitted to be heterogeneous and to deviate widely from biologically informed connectionist models, for the sake of efficiency, trainability and understandability.']
    encoded_queries = model.encode_queries(queries, 1)
    encoded_docs = model.encode_corpus(documents, 1)
    # Pairwise query-document Euclidean distances: lower means more similar.
    print("Query-document distances:", torch.cdist(encoded_queries, encoded_docs))