Draft: Dev package
Browse files
- .gitlab-ci.yml +4 -4
- app.py +5 -4
- data/master_test_text2int.csv +0 -90
- {modules → mathtext_fastapi}/__init__.py +0 -0
- {data → mathtext_fastapi/data}/text2int_results.csv +3 -1
- {modules → mathtext_fastapi}/nlu.py +9 -14
- modules/sentiment.py +0 -8
- modules/text2int.py +0 -192
- pyproject.toml +43 -0
- requirements.txt +3 -12
- tests/test_text2int.py +11 -6
.gitlab-ci.yml
CHANGED
@@ -1,14 +1,14 @@
 # Official Python language image.
-
-image: python:3.
+test_py38:
+  image: python:3.8
 before_script:
   - python -v
   - pip install -r requirements.txt
 script:
   - pytest --verbose
 
-
-image: python:3.
+test_py39:
+  image: python:3.9
 before_script:
   - python -v
   - pip install -r requirements.txt
app.py
CHANGED
@@ -6,11 +6,11 @@ from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse
 from fastapi.staticfiles import StaticFiles
 from fastapi.templating import Jinja2Templates
-from pydantic import BaseModel
-
-from modules.nlu import prepare_message_data_for_logging
 from mathtext.sentiment import sentiment
 from mathtext.text2int import text2int
+from pydantic import BaseModel
+
+from mathtext_fastapi.nlu import prepare_message_data_for_logging
 
 app = FastAPI()
 
@@ -67,7 +67,7 @@ async def evaluate_user_message_with_nlu_api(request: Request):
 
     int_api_resp = text2int(message_text)
 
-    if int_api_resp ==
+    if int_api_resp == 32202:
         sentiment_api_resp = sentiment(message_text)
         # [{'label': 'POSITIVE', 'score': 0.991188645362854}]
         sent_data_dict = {'type': 'sentiment', 'data': sentiment_api_resp[0]['label']}
@@ -76,4 +76,5 @@ async def evaluate_user_message_with_nlu_api(request: Request):
     prepare_message_data_for_logging(message_data)
 
     int_data_dict = {'type': 'integer', 'data': int_api_resp}
+
     return JSONResponse(content=int_data_dict)
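To make the hunk above easier to follow: the endpoint tries integer extraction first and only falls back to sentiment analysis when text2int returns the 32202 error sentinel. Below is a minimal, self-contained sketch of that flow; the stand-in text2int and sentiment functions, the /nlu route name, and the 'message_text' key are illustrative assumptions, not the real mathtext implementations.

# Minimal sketch of the evaluate_user_message_with_nlu_api flow shown above.
# The stand-in text2int/sentiment below are illustrative; the real ones come
# from the mathtext package. 32202 is the error sentinel checked in app.py.
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse

app = FastAPI()

TOKENS2INT_ERROR_INT = 32202  # sentinel returned when no number can be parsed


def text2int(text: str) -> int:
    """Stand-in: parse plain digits, otherwise return the error sentinel."""
    return int(text) if text.strip().isdigit() else TOKENS2INT_ERROR_INT


def sentiment(text: str):
    """Stand-in: transformers-pipeline-shaped output, e.g. [{'label': ..., 'score': ...}]."""
    return [{'label': 'POSITIVE', 'score': 0.99}]


@app.post("/nlu")
async def evaluate_user_message(request: Request):
    message_data = await request.json()
    message_text = message_data['message_text']

    int_api_resp = text2int(message_text)
    if int_api_resp == TOKENS2INT_ERROR_INT:
        # No integer found, so report sentiment instead.
        sentiment_api_resp = sentiment(message_text)
        return JSONResponse(content={'type': 'sentiment', 'data': sentiment_api_resp[0]['label']})

    return JSONResponse(content={'type': 'integer', 'data': int_api_resp})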
data/master_test_text2int.csv
DELETED
@@ -1,90 +0,0 @@
-input,output
-fourteen,14
-forteen,14
-one thousand four hundred ninety two,1492
-one thousand ninety two,1092
-Fourteen Hundred Ninety-Two,1492
-Fourteen Hundred,1400
-Ninety nine,99
-fifteen thousand five hundred-sixty,15560
-three hundred fifty,350
-one nine eight five,1985
-nineteen eighty-five,1985
-oh one,1
-six oh 1,601
-sex,6
-six,6
-eight oh,80
-eighty,80
-ate,8
-double eight,88
-eight three seven five three O nine,8375309
-eight three seven five three oh nine,8375309
-eight three seven five three zero nine,8375309
-eight three seven five three oh ni-ee-ine,8375309
-two eight,28
-seven oh eleven,7011
-seven elevens,77
-seven eleven,711
-ninety nine oh five,9905
-seven 0 seven 0 seven 0 seven,7070707
-123 hundred,123000
-5 o 5,505
-15 o 5,1505
-15-o 5,1505
-15 o-5,1505
-911-thousand,911000
-twenty-two twenty-two,2222
-twenty-two twenty-twos,484
-four eighty four,484
-four eighties,320
-four eighties and nine nineties,1130
-ninety nine hundred and seventy seven,9977
-seven thousands,7000
-2 hundreds,200
-99 thousands and one,99001
-"forty-five thousand, seven hundred and nine",45709
-eighty eight hundred eighty,8880
-a hundred hundred,10000
-a hundred thousand,100000
-a hundred million,100000000
-nineteen ninety nine,1999
-forteen twenty seven,1427
-seventeen-thousand and seventy two,17072
-two hundred and nine,209
-two thousand ten,2010
-two thousand and ten,2010
-twelve million,12000000
-8 billion,8000000000
-twenty ten,2010
-thirty-two hundred,3200
-nine,9
-forty two,42
-1 2 three,123
-fourtean,14
-one tousand four hundred ninty two,1492
-Furteen Hundrd Ninety-Too,1492
-forrteen,14
-sevnteen-thosand and seventy two,17072
-ninety nine hundred ad seventy seven,9977
-seven thusands,7000
-2 hunreds,200
-99 tousands and one,99001
-eighty ate hundred eighty,8880
-fourteen Hundred,1400
-8 Bilion,8000000000
-one million three thousand one,1003001
-four million nine thousand seven,4009007
-two million five hundred thousand,2500000
-two tousand ten,2010
-two thousand teen,2010
-tvelve milion,12000000
-tventy ten,2010
-tirty-twoo hunred,3200
-sevn thoosands,7000
-five,5
-ten,10
-one two three and ten,12310
-ONE MILLion three hunded and fiv,1000305
-"50,500 and six",50506
-one_million_and_five,1000005
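The test data itself now ships with the mathtext package rather than this repo; the updated tests (see tests/test_text2int.py below) read it straight from GitLab. A small sketch of that read, using the raw-file URL from the test module:

# Sketch: load the relocated text2int test cases the same way the updated tests do.
import pandas as pd

URL = ("https://gitlab.com/tangibleai/community/mathtext/-/raw/main/"
       "mathtext/data/master_test_text2int.csv")

df = pd.read_csv(URL)  # two columns: input (number words), output (expected int)
print(df.head())       # e.g. fourteen -> 14, forteen -> 14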
{modules → mathtext_fastapi}/__init__.py
RENAMED
File without changes
{data → mathtext_fastapi/data}/text2int_results.csv
RENAMED
@@ -1,4 +1,6 @@
 input,output,text2int,score
+notanumber,32202,32202,True
+this is not a number,32202,32202,True
 fourteen,14,14,True
 forteen,14,14,True
 one thousand four hundred ninety two,1492,1492,True
@@ -21,7 +23,7 @@ double eight,88,32202,False
 eight three seven five three O nine,8375309,8375329,False
 eight three seven five three oh nine,8375309,8375309,True
 eight three seven five three zero nine,8375309,8375309,True
-eight three seven five three oh ni-ee-ine,8375309,
+eight three seven five three oh ni-ee-ine,8375309,837530611,False
 two eight,28,16,False
 seven oh eleven,7011,77,False
 seven elevens,77,77,True
{modules → mathtext_fastapi}/nlu.py
RENAMED
@@ -1,18 +1,13 @@
-import environ
-import json
 import os
-import requests
-
 from datetime import datetime
+
+from dotenv import load_dotenv
 from supabase import create_client
 
+load_dotenv()
 
-
-env = environ.Env()
-env_path = os.path.join(BASE_DIR, '.env')
-environ.Env.read_env('.env')
+SUPA = create_client(os.environ.get('SUPABASE_URL'), os.environ.get('SUPABASE_KEY'))
 
-SUPA = create_client(env('SUPABASE_URL'), env('SUPABASE_KEY'))
 
 def log_message_data_through_supabase_api(table_name, log_data):
     return SUPA.table(table_name).insert(log_data).execute()
@@ -28,19 +23,19 @@ def prepare_message_data_for_logging(message_data):
         # Autogenerated fields: id, created_at, modified_at
     }
     project_data_log = log_message_data_through_supabase_api('project', project_data)
-
+
     contact_data = {
-        'project': project_data_log.data[0]['id'],
+        'project': project_data_log.data[0]['id'],  # FK
         'original_contact_id': message_data['message']['_vnd']['v1']['chat']['contact_uuid'],
        'urn': "",
        'language_code': "en",
        'contact_inserted_at': format_datetime_in_isoformat(datetime.now())
-        # Autogenerated fields: id, created_at, modified_at
+        # Autogenerated fields: id, created_at, modified_at
     }
     contact_data_log = log_message_data_through_supabase_api('contact', contact_data)
 
     message_data = {
-        'contact': contact_data_log.data[0]['id'],
+        'contact': contact_data_log.data[0]['id'],  # FK
         'original_message_id': message_data['message']['id'],
         'text': message_data['message']['text']['body'],
         'direction': message_data['message']['_vnd']['v1']['direction'],
@@ -49,6 +44,6 @@ def prepare_message_data_for_logging(message_data):
         'message_inserted_at': message_data['message']['_vnd']['v1']['chat']['inserted_at'],
         'message_modified_at': message_data['message']['_vnd']['v1']['chat']['updated_at'],
         'message_sent_at': format_datetime_in_isoformat(datetime.now())
-        # Autogenerated fields: created_at, modified_at
+        # Autogenerated fields: created_at, modified_at
     }
     message_data_log = log_message_data_through_supabase_api('message', message_data)
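In short, the rename also swaps django-environ for python-dotenv: credentials are read from a local .env file (or the environment) and the Supabase client is created once at import time. A self-contained sketch of the new pattern, with a hypothetical payload in the usage comment:

# Sketch of the dotenv + Supabase setup now used in mathtext_fastapi/nlu.py.
# SUPABASE_URL and SUPABASE_KEY must be present in .env or the environment.
import os

from dotenv import load_dotenv
from supabase import create_client

load_dotenv()  # populates os.environ from .env; replaces django-environ

SUPA = create_client(os.environ.get('SUPABASE_URL'), os.environ.get('SUPABASE_KEY'))


def log_message_data_through_supabase_api(table_name, log_data):
    """Insert one row into a Supabase table and return the API response."""
    return SUPA.table(table_name).insert(log_data).execute()


# Hypothetical usage (not the real message schema):
# log_message_data_through_supabase_api('message', {'text': 'forty two', 'direction': 'inbound'})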
modules/sentiment.py
DELETED
@@ -1,8 +0,0 @@
-from transformers import pipeline
-
-sentiment_obj = pipeline(task="sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
-
-
-def sentiment(text):
-    # Returns sentiment value
-    return sentiment_obj(text)
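The sentiment helper now lives in the mathtext package (app.py imports mathtext.sentiment above). For reference, a usage sketch of the equivalent transformers pipeline call; the example input is illustrative:

# Sketch: equivalent call to the deleted helper, via the transformers pipeline.
from transformers import pipeline

sentiment_obj = pipeline(task="sentiment-analysis",
                         model="distilbert-base-uncased-finetuned-sst-2-english")

print(sentiment_obj("I love math"))  # e.g. [{'label': 'POSITIVE', 'score': 0.99...}]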
modules/text2int.py
DELETED
@@ -1,192 +0,0 @@
-import spacy  # noqa
-
-# import os
-# os.environ['KMP_DUPLICATE_LIB_OK']='True'
-# import spacy
-
-# Change this according to what words should be corrected to
-SPELL_CORRECT_MIN_CHAR_DIFF = 2
-
-TOKENS2INT_ERROR_INT = 32202
-
-ONES = [
-    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
-    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
-    "sixteen", "seventeen", "eighteen", "nineteen",
-]
-
-CHAR_MAPPING = {
-    "-": " ",
-    "_": " ",
-    "and": " ",
-}
-# CHAR_MAPPING.update((str(i), word) for i, word in enumerate([" " + s + " " for s in ONES]))
-TOKEN_MAPPING = {
-    "and": " ",
-    "oh": "0",
-}
-
-
-def find_char_diff(a, b):
-    # Finds the character difference between two str objects by counting the occurences of every character. Not edit distance.
-    char_counts_a = {}
-    char_counts_b = {}
-    for char in a:
-        if char in char_counts_a.keys():
-            char_counts_a[char] += 1
-        else:
-            char_counts_a[char] = 1
-    for char in b:
-        if char in char_counts_b.keys():
-            char_counts_b[char] += 1
-        else:
-            char_counts_b[char] = 1
-    char_diff = 0
-    for i in char_counts_a:
-        if i in char_counts_b.keys():
-            char_diff += abs(char_counts_a[i] - char_counts_b[i])
-        else:
-            char_diff += char_counts_a[i]
-    return char_diff
-
-
-def tokenize(text):
-    text = text.lower()
-    # print(text)
-    text = replace_tokens(''.join(i for i in replace_chars(text)).split())
-    # print(text)
-    text = [i for i in text if i != ' ']
-    # print(text)
-    output = []
-    for word in text:
-        # print(word)
-        output.append(convert_word_to_int(word))
-    output = [i for i in output if i != ' ']
-    # print(output)
-    return output
-
-
-def detokenize(tokens):
-    return ' '.join(tokens)
-
-
-def replace_tokens(tokens, token_mapping=TOKEN_MAPPING):
-    return [token_mapping.get(tok, tok) for tok in tokens]
-
-
-def replace_chars(text, char_mapping=CHAR_MAPPING):
-    return [char_mapping.get(c, c) for c in text]
-
-
-def convert_word_to_int(in_word, numwords={}):
-    # Converts a single word/str into a single int
-    tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
-    scales = ["hundred", "thousand", "million", "billion", "trillion"]
-    if not numwords:
-        for idx, word in enumerate(ONES):
-            numwords[word] = idx
-        for idx, word in enumerate(tens):
-            numwords[word] = idx * 10
-        for idx, word in enumerate(scales):
-            numwords[word] = 10 ** (idx * 3 or 2)
-    if in_word in numwords:
-        # print(in_word)
-        # print(numwords[in_word])
-        return numwords[in_word]
-    try:
-        int(in_word)
-        return int(in_word)
-    except ValueError:
-        pass
-    # Spell correction using find_char_diff
-    char_diffs = [find_char_diff(in_word, i) for i in ONES + tens + scales]
-    min_char_diff = min(char_diffs)
-    if min_char_diff <= SPELL_CORRECT_MIN_CHAR_DIFF:
-        return char_diffs.index(min_char_diff)
-
-
-def tokens2int(tokens):
-    # Takes a list of tokens and returns a int representation of them
-    types = []
-    for i in tokens:
-        if i <= 9:
-            types.append(1)
-
-        elif i <= 90:
-            types.append(2)
-
-        else:
-            types.append(3)
-    # print(tokens)
-    if len(tokens) <= 3:
-        current = 0
-        for i, number in enumerate(tokens):
-            if i != 0 and types[i] < types[i - 1] and current != tokens[i - 1] and types[i - 1] != 3:
-                current += tokens[i] + tokens[i - 1]
-            elif current <= tokens[i] and current != 0:
-                current *= tokens[i]
-            elif 3 not in types and 1 not in types:
-                current = int(''.join(str(i) for i in tokens))
-                break
-            elif '111' in ''.join(str(i) for i in types) and 2 not in types and 3 not in types:
-                current = int(''.join(str(i) for i in tokens))
-                break
-            else:
-                current += number
-
-    elif 3 not in types and 2 not in types:
-        current = int(''.join(str(i) for i in tokens))
-
-    else:
-        """
-        double_list = []
-        current_double = []
-        double_type_list = []
-        for i in tokens:
-            if len(current_double) < 2:
-                current_double.append(i)
-            else:
-                double_list.append(current_double)
-                current_double = []
-        current_double = []
-        for i in types:
-            if len(current_double) < 2:
-                current_double.append(i)
-            else:
-                double_type_list.append(current_double)
-                current_double = []
-        print(double_type_list)
-        print(double_list)
-        current = 0
-        for i, type_double in enumerate(double_type_list):
-            if len(type_double) == 1:
-                current += double_list[i][0]
-            elif type_double[0] == type_double[1]:
-                current += int(str(double_list[i][0]) + str(double_list[i][1]))
-            elif type_double[0] > type_double[1]:
-                current += sum(double_list[i])
-            elif type_double[0] < type_double[1]:
-                current += double_list[i][0] * double_list[i][1]
-        #print(current)
-        """
-        count = 0
-        current = 0
-        for i, token in enumerate(tokens):
-            count += 1
-            if count == 2:
-                if types[i - 1] == types[i]:
-                    current += int(str(token) + str(tokens[i - 1]))
-                elif types[i - 1] > types[i]:
-                    current += tokens[i - 1] + token
-                else:
-                    current += tokens[i - 1] * token
-                count = 0
-            elif i == len(tokens) - 1:
-                current += token
-
-    return current
-
-
-def text2int(text):
-    # Wraps all of the functions up into one
-    return tokens2int(tokenize(text))
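This module moved into the mathtext package, which app.py and requirements.txt now pull from GitLab. A usage sketch of the relocated function, with expected values taken from the test data and results CSVs above (assuming the packaged text2int keeps the same behavior and the 32202 sentinel):

# Usage sketch for the relocated text2int; expected values come from
# master_test_text2int.csv and text2int_results.csv above.
from mathtext.text2int import text2int

print(text2int("fourteen"))              # 14
print(text2int("nineteen eighty-five"))  # 1985
print(text2int("forteen"))               # 14, via the char-diff spell correction
print(text2int("notanumber"))            # 32202, the TOKENS2INT_ERROR_INT sentinel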
pyproject.toml
ADDED
@@ -0,0 +1,43 @@
+[tool.poetry]
+name = "MathText_FastAPI"
+version = "0.0.1"
+authors = [
+    "Sebastian Larsen <[email protected]>",
+    "Çetin ÇAKIR <[email protected]>",
+    "Hobson Lane <[email protected]>",
+]
+description = "Natural Language Understanding (text processing) for math symbols, digits, and words with a Gradio user interface and REST API."
+readme = "README.md"
+# requires-python = ">=3.8"
+license = "AGPL-3.0-or-later"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
+    "Operating System :: OS Independent",
+]
+
+
+[tool.poetry.dependencies]
+mathtext = {git = "https://gitlab.com/tangibleai/community/mathtext", rev = "main"}
+fastapi = "0.74.*"
+pydantic = "*"
+python = "^3.8,<3.10"
+requests = "2.27.*"
+sentencepiece = "0.1.*"
+supabase = "*"
+uvicorn = "0.17.*"
+
+[tool.poetry.group.dev.dependencies]
+pytest = "^7.2"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+
+# [build-system]
+# requires = ["hatchling"]
+# build-backend = "hatchling.build"
+
+# repository = "https://gitlab.com/tangibleai/community/mathtext-fastapi"
requirements.txt
CHANGED
@@ -1,16 +1,7 @@
+mathtext @ git+https://gitlab.com/tangibleai/community/mathtext@main
 fastapi==0.74.*
+pydantic==1.10.*
 requests==2.27.*
 sentencepiece==0.1.*
-torch==1.12.*
-transformers==4.24.*
-uvicorn[standard]==0.17.*
-pydantic
-mathtext @ git+https://gitlab.com/tangibleai/community/mathtext@main
-spacy==3.4.*
-pandas==1.5.*
-matplotlib==3.6.*
-pytest==7.2.*
-httpx==0.23.*
-
-django-environ
 supabase
+uvicorn==0.17.*
tests/test_text2int.py
CHANGED
@@ -1,11 +1,16 @@
 import unittest
+from pathlib import Path
 
 import pandas as pd
 from fastapi.testclient import TestClient
 
 from app import app
 
-
+# The raw file URL has to be used for GitLab.
+URL = "https://gitlab.com/tangibleai/community/mathtext/-/raw/main/mathtext/data/master_test_text2int.csv"
+
+DATA_DIR = Path(__file__).parent.parent / "mathtext_fastapi" / "data"
+print(DATA_DIR)
 
 client = TestClient(app)
 
@@ -15,6 +20,7 @@ class TestStringMethods(unittest.TestCase):
     def setUp(self):
         """Creates a fastapi test client"""
         self.client = TestClient(app)
+        self.df = pd.read_csv(URL)
 
     def get_response_text2int(self, text):
         """Makes a post request to the endpoint"""
@@ -35,15 +41,14 @@
 
     def test_acc_score_text2int(self):
         """Calculates accuracy score for endpoint"""
-        df = pd.read_csv(TEST_DATA_FILE)
 
-        df["text2int"] = df["input"].apply(func=self.get_response_text2int)
-        df["score"] = df[["output", "text2int"]].apply(
+        self.df["text2int"] = self.df["input"].apply(func=self.get_response_text2int)
+        self.df["score"] = self.df[["output", "text2int"]].apply(
             lambda row: row[0] == row[1],
             axis=1
        )
-        df.to_csv("
-        acc_score = df["score"].mean().__round__(2)
+        self.df.to_csv(f"{DATA_DIR}/text2int_results.csv", index=False)
+        acc_score = self.df["score"].mean().__round__(2)
 
         self.assertGreaterEqual(acc_score, 0.5, f"Accuracy score: '{acc_score}'. Value is too low!")
 
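For clarity, the accuracy check above boils down to comparing the expected column with the endpoint's answers and asserting the mean is at least 0.5. A standalone sketch with a tiny in-memory frame in place of the live endpoint and remote CSV:

# Standalone sketch of the accuracy score computed in test_acc_score_text2int.
# Uses a small in-memory DataFrame instead of the TestClient and remote CSV.
import pandas as pd

df = pd.DataFrame({
    "input": ["fourteen", "forteen", "double eight"],
    "output": [14, 14, 88],        # expected integers
    "text2int": [14, 14, 32202],   # what the endpoint returned (32202 = error sentinel)
})

df["score"] = df[["output", "text2int"]].apply(lambda row: row[0] == row[1], axis=1)
acc_score = round(df["score"].mean(), 2)
print(acc_score)  # 0.67 -> the test asserts this is >= 0.5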
|