Fariddwi committed on
Commit
5139c5a
1 Parent(s): 626f3d2

Upload 6 files

Files changed (6)
  1. Dockerfile +21 -0
  2. TextSimilarity.ipynb +141 -0
  3. TextSimilarity.py +28 -0
  4. app.py +51 -0
  5. requirements.txt +8 -0
  6. utils.py +17 -0
Dockerfile ADDED
@@ -0,0 +1,21 @@
+ # Base image
+ FROM python:3.9
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy the application files
+ COPY app.py .
+ COPY TextSimilarity.py .
+ COPY requirements.txt .
+ COPY utils.py .
+
+ # Install dependencies
+ RUN pip3 install -r requirements.txt
+ RUN python3 download_model.py
+
+ # Expose the port
+ EXPOSE 8000
+
+ # Run the application
+ CMD ["python3", "app.py"]
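The build step RUN python3 download_model.py refers to a script that is neither among the six uploaded files nor copied into the image, so the build would fail as committed. A minimal sketch of what such a script might look like, assuming its only job is to pre-cache the M-CLIP checkpoint and tokenizer that TextSimilarity.py loads (the file name and contents are assumptions, and a matching COPY line would also be needed):

# download_model.py (hypothetical): pre-download the M-CLIP weights at build time
# so the container does not fetch them on the first request.
from multilingual_clip import pt_multilingual_clip
import transformers

MODEL_NAME = "M-CLIP/XLM-Roberta-Large-Vit-B-32"  # default used by TextSimilarity

if __name__ == "__main__":
    # Both calls populate the local Hugging Face cache baked into the image.
    transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
    pt_multilingual_clip.MultilingualCLIP.from_pretrained(MODEL_NAME)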
TextSimilarity.ipynb ADDED
@@ -0,0 +1,141 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']\n",
+ "- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "torch.Size([4, 768])\n"
+ ]
+ }
+ ],
+ "source": [
+ "from multilingual_clip import pt_multilingual_clip\n",
+ "import transformers\n",
+ "\n",
+ "texts = [\n",
+ " 'Three blind horses listening to Mozart.',\n",
+ " 'Älgen är skogens konung!',\n",
+ " 'Wie leben Eisbären in der Antarktis?',\n",
+ " 'Вы знали, что все белые медведи левши?'\n",
+ "]\n",
+ "model_name = 'M-CLIP/XLM-Roberta-Large-Vit-L-14'\n",
+ "\n",
+ "# Load Model & Tokenizer\n",
+ "model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(model_name)\n",
+ "tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)\n",
+ "\n",
+ "embeddings = model.forward(texts, tokenizer)\n",
+ "print(embeddings.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "texts = [\n",
+ " 'Aku sayang kamu',\n",
+ " 'Aku benci kamu',\n",
+ "]\n",
+ "embeddings = model.forward(texts, tokenizer)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "embeddings_1, embeddings_2 = embeddings\n",
+ "embeddings_1 = embeddings_1.cpu().detach().numpy()\n",
+ "embeddings_2 = embeddings_2.cpu().detach().numpy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "from numpy.linalg import norm"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.967305\n"
+ ]
+ }
+ ],
+ "source": [
+ "cosine = np.dot(embeddings_1,embeddings_2)/(norm(embeddings_1)*norm(embeddings_2))\n",
+ "print(cosine)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ },
+ "orig_nbformat": 4,
+ "vscode": {
+ "interpreter": {
+ "hash": "ad2bdc8ecc057115af97d19610ffacc2b4e99fae6737bb82f5d7fb13d2f2c186"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
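The last cells compute a single cosine similarity with np.dot and norm. When comparing more than two sentences, the same arithmetic vectorizes into a full pairwise matrix; a small sketch, assuming embeddings is the (n, 768) tensor returned by model.forward above:

import numpy as np

emb = embeddings.cpu().detach().numpy()                  # (n, d) sentence embeddings
emb = emb / np.linalg.norm(emb, axis=1, keepdims=True)   # unit-normalize each row
similarity_matrix = emb @ emb.T                          # (n, n); entry [i, j] is cos(text_i, text_j)
print(np.round(similarity_matrix, 4))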
TextSimilarity.py ADDED
@@ -0,0 +1,28 @@
+ from multilingual_clip import pt_multilingual_clip
+ from numpy.linalg import norm
+ import transformers
+ import numpy as np
+ import torch
+
+ # M-CLIP/XLM-Roberta-Large-Vit-L-14
+ # M-CLIP/XLM-Roberta-Large-Vit-B-16Plus
+
+
+ class TextSimilarity:
+     def __init__(self, name_model="M-CLIP/XLM-Roberta-Large-Vit-B-32"):
+         self.name_model = name_model
+         self.device = torch.device(
+             "cuda:0" if torch.cuda.is_available() else "cpu")
+         self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+             self.name_model)
+         self.model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(
+             self.name_model)
+         self.model.eval()
+
+     def predict(self, text_1, text_2):
+         with torch.no_grad():
+             embeddings = self.model.forward([text_1, text_2], self.tokenizer)
+         embeddings_1, embeddings_2 = embeddings.cpu().detach().numpy()
+         cosine = np.dot(embeddings_1, embeddings_2) / \
+             (norm(embeddings_1)*norm(embeddings_2))
+         return cosine
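A short usage sketch for the class above, assuming the multilingual-clip package that provides pt_multilingual_clip is installed (the sentences are arbitrary examples):

from TextSimilarity import TextSimilarity

similarity = TextSimilarity()  # downloads M-CLIP/XLM-Roberta-Large-Vit-B-32 on first run
cosine = similarity.predict("Aku sayang kamu", "I love you")
print(f"cosine similarity: {cosine:.4f}")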
app.py ADDED
@@ -0,0 +1,51 @@
+ import uvicorn
+ from utils import check_score
+ from pydantic import BaseModel
+ from fastapi import FastAPI, status
+ from TextSimilarity import TextSimilarity
+ from starlette.responses import JSONResponse
+ from fastapi.middleware.cors import CORSMiddleware
+
+
+ class RequestBody(BaseModel):
+     text_1: str
+     text_2: str
+
+
+ app = FastAPI(docs_url=None, redoc_url=None)
+ text_similarity = TextSimilarity()
+
+ origins = ['*']
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=origins,
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+
+ )
+
+
+ @app.post("/penilaian")
+ async def penilaian(data_request: RequestBody):
+     if not data_request.text_2:
+         return JSONResponse({
+             "probability": 0,
+             "score": 0
+         }, status_code=status.HTTP_200_OK)
+     try:
+         probability = text_similarity.predict(
+             data_request.text_1, data_request.text_2)
+         return_value = check_score(float(probability))
+         return JSONResponse(
+             return_value, status_code=status.HTTP_200_OK)
+     except Exception as e:
+         print(e)
+         return JSONResponse({
+             "errors": "Please contact your administrator"
+         }, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
+
+
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=8000)  # bind to all interfaces on the port exposed in the Dockerfile
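Once the server is up, the /penilaian endpoint can be exercised with a small client like the one below (the requests library and the example sentences are assumptions; adjust the URL to wherever the app is listening):

import requests

payload = {
    "text_1": "Ibu kota Indonesia adalah Jakarta",
    "text_2": "Jakarta adalah ibu kota Indonesia",
}
response = requests.post("http://localhost:8000/penilaian", json=payload)
print(response.json())  # {"probability": <cosine>, "score": <1-4>}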
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ pydantic==1.8.2
+ starlette==0.14.2
+ transformers==4.30.0
+ numpy
+ uvicorn[standard]
+ fastapi
+ torch>=1.13.1
+ torchvision>=0.14.1
utils.py ADDED
@@ -0,0 +1,17 @@
+ def check_score(probability: float):
+     return_value = {
+         "probability": probability,
+         "score": 0,
+     }
+     if probability >= 0.95:
+         return_value["score"] = 4
+         return return_value
+     elif 0.89 <= probability < 0.95:
+         return_value["score"] = 3
+         return return_value
+     elif 0.75 <= probability < 0.89:
+         return_value["score"] = 2
+         return return_value
+     else:
+         return_value["score"] = 1
+         return return_value
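For reference, a few sample calls showing how the cosine similarity is bucketed into the 1-4 score (the probe values are chosen only to hit each band):

from utils import check_score

for p in (0.97, 0.90, 0.80, 0.50):
    print(check_score(p))
# {'probability': 0.97, 'score': 4}
# {'probability': 0.9, 'score': 3}
# {'probability': 0.8, 'score': 2}
# {'probability': 0.5, 'score': 1}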