|
--- |
|
language: |
|
- en |
|
tags: |
|
- feature-extraction |
|
- pubmed |
|
- sentence-similarity |
|
datasets: |
|
- biu-nlp/abstract-sim-pubmed |
|
--- |
|
|
|
A model for mapping abstract sentence descriptions to sentences that fit those descriptions, trained on PubMed sentences. Use `load_finetuned_model()` to load the tokenizer together with the query and sentence encoders, and `encode_batch()` to encode a batch of sentences (or queries) with the corresponding encoder. |
|
|
|
```python |
|
|
|
from transformers import AutoTokenizer, AutoModel |
|
import torch |
|
|
|
def load_finetuned_model():
    """Load the tokenizer and both encoders from the Hugging Face Hub.

    Returns:
        A ``(tokenizer, query_encoder, sentence_encoder)`` tuple, in that
        order.
    """
    sentence_repo = "biu-nlp/abstract-sim-sentence-pubmed"
    query_repo = "biu-nlp/abstract-sim-query-pubmed"

    # Both encoders are pinned to a fixed commit so the weights cannot
    # change underneath this snippet.
    sentence_encoder = AutoModel.from_pretrained(
        sentence_repo,
        revision="71f4539120e29024adc618173a1ed5fd230ac249",
    )
    query_encoder = AutoModel.from_pretrained(
        query_repo,
        revision="8d34676d80a39bcbc5a1d2eec13e6f8078496215",
    )

    # NOTE(review): the tokenizer comes from the sentence-encoder repository
    # and is used for both encoders; unlike the encoders it is loaded
    # without a pinned revision.
    tokenizer = AutoTokenizer.from_pretrained(sentence_repo)

    return tokenizer, query_encoder, sentence_encoder
|
|
|
|
|
def encode_batch(model, tokenizer, sentences, device):
    """Encode ``sentences`` into one mask-weighted mean vector per sentence.

    Args:
        model: Callable invoked with the tokenizer output as keyword
            arguments; only the first element of its result is used
            (presumably per-token hidden states of shape
            ``(batch, seq, hidden)`` -- confirm against the model class).
        tokenizer: Callable that tokenizes ``sentences`` and returns a
            mapping with an ``"attention_mask"`` entry and a ``.to(device)``
            method.
        sentences: The text(s) to encode; passed to ``tokenizer`` unchanged.
        device: Device the tokenized inputs are moved to. ``model`` itself
            is not moved here, so it is presumably expected to already be
            on ``device``.

    Returns:
        The token features summed over dimension 1 with masked positions
        zeroed, divided by the number of unmasked tokens per row.
    """
    # Pad to the longest sentence in the batch; truncate at 128 tokens.
    encoded = tokenizer(
        sentences,
        padding=True,
        max_length=128,
        truncation=True,
        return_tensors="pt",
        add_special_tokens=True,
    ).to(device)

    token_states = model(**encoded)[0]
    mask = encoded["attention_mask"]

    # Zero out padded positions before summing along the sequence axis.
    masked_sum = (token_states * mask.unsqueeze(-1)).sum(dim=1)

    # Clamp the per-row token count so a fully masked row cannot divide by 0.
    token_counts = mask.sum(dim=1, keepdim=True).clamp(min=1e-9)

    return masked_sum / token_counts
|
|
|
``` |