MARI-posa committed on
Commit
83a648d
1 Parent(s): 7072652

Upload 2 files

Browse files
Files changed (2) hide show
  1. book_train.csv +0 -0
  2. stri.py +65 -0
book_train.csv ADDED
The diff for this file is too large to render. See raw diff
 
stri.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ import numpy as np
4
+ import pandas as pd
5
+ from transformers import AutoTokenizer, AutoModel
6
+
7
# Settings of the book recommender.
MODEL_NAME = "cointegrated/rubert-tiny2"
BOOKS_CSV = "book_train.csv"
MAX_LENGTH = 128  # token budget per text; longer inputs are truncated
BATCH_SIZE = 32  # annotations encoded per forward pass
TOP_K = 10  # number of recommendations shown


def mean_pool(hidden, attention_mask):
    """Average token vectors over the real (non-padding) positions only.

    A plain ``torch.mean(hidden, dim=1)`` also averages the vectors produced
    at padding positions, which distorts the embedding of short texts.

    Args:
        hidden: tensor indexed as (batch, tokens, features).
        attention_mask: tensor indexed as (batch, tokens); non-zero marks a
            real token, zero marks padding.

    Returns:
        Tensor indexed as (batch, features). A row whose mask is all zeros
        yields a zero vector instead of NaN.
    """
    mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # Guard against division by zero for a fully masked row.
    counts = mask.sum(dim=1).clamp(min=1.0)
    return summed / counts


def embed_texts(texts, tokenizer, model, batch_size=BATCH_SIZE):
    """Encode a list of strings into one embedding per string.

    Texts are processed in batches; each batch is padded to its longest
    member and truncated to ``MAX_LENGTH`` tokens. The embedding is the
    mask-aware mean of ``hidden_states[-2]`` (the same layer the script has
    always read), so the model must be loaded with
    ``output_hidden_states=True``.

    Args:
        texts: list of ``str``.
        tokenizer: tokenizer returned by ``AutoTokenizer.from_pretrained``.
        model: model returned by ``AutoModel.from_pretrained``.
        batch_size: number of texts per forward pass.

    Returns:
        Tensor with one row per input text (an empty tensor for no texts).
    """
    pooled = []
    for start in range(0, len(texts), batch_size):
        encoded = tokenizer(
            texts[start:start + batch_size],
            add_special_tokens=True,
            max_length=MAX_LENGTH,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        with torch.no_grad():
            outputs = model(**encoded)
        pooled.append(
            mean_pool(outputs.hidden_states[-2], encoded["attention_mask"])
        )
    if not pooled:
        return torch.empty((0, 0))
    return torch.cat(pooled)


def top_matches(query_embedding, embeddings, k=TOP_K):
    """Return the row indices of ``embeddings`` closest to the query.

    Args:
        query_embedding: 1-D tensor of features.
        embeddings: 2-D tensor, one row per candidate.
        k: maximum number of indices to return.

    Returns:
        List of ``int`` positions ordered by decreasing cosine similarity;
        ties keep their original row order. Empty when there are no rows.
    """
    if embeddings.shape[0] == 0:
        return []
    scores = torch.nn.functional.cosine_similarity(
        query_embedding.reshape(1, -1), embeddings, dim=1
    )
    order = np.argsort(-scores.detach().cpu().numpy(), kind="stable")
    return order[:k].tolist()


def build_index():
    """Load the model, the book titles and the annotation embeddings.

    This is the expensive part of the app; ``main`` wraps it in
    ``st.cache_resource`` so it is not repeated on every Streamlit rerun.

    Returns:
        Tuple ``(tokenizer, model, titles, embeddings)`` where ``titles`` is
        a list aligned row by row with ``embeddings``.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME, output_hidden_states=True)

    books = pd.read_csv(BOOKS_CSV)
    # Missing annotations are read as NaN (a float), which the tokenizer
    # cannot encode, so replace them with empty strings.
    annotations = books["annotation"].fillna("").astype(str).tolist()
    # A plain list gives positional access regardless of the frame's index.
    titles = books["title"].tolist()

    return tokenizer, model, titles, embed_texts(annotations, tokenizer, model)


def main():
    """Render the page: read a query and list the most similar books."""
    st.title("Книжные рекомендации")

    # Streamlit re-executes the script on every interaction; caching keeps
    # the model and the precomputed embeddings alive between reruns.
    tokenizer, model, titles, embeddings = st.cache_resource(build_index)()

    query = st.text_input("Введите запрос")
    # Nothing to recommend until the user has typed something.
    if not query.strip():
        return

    query_embedding = embed_texts([query], tokenizer, model)[0]

    st.header("Рекомендации")
    for i in top_matches(query_embedding, embeddings):
        st.write(titles[i])


if __name__ == "__main__":
    main()