|
import math |
|
|
|
# Look-alike characters folded onto one canonical letter during tokenization,
# so mixed Latin/Cyrillic spellings compare equal (e.g. Latin "e" -> Cyrillic
# "е", digit "3" -> "з").
# NOTE(review): "i": "l" maps Latin i to Latin l, not to a Cyrillic letter —
# presumably deliberate homoglyph folding, but worth confirming.
similar_letters = {"й": "и", "ё": "е", "e": "е", "t": "т", "i": "l", "o": "о", "k": "к", "3": "з", "a": "а", "x": "х", "c": "с", "m": "м"}

# The recognized alphabet: Latin letters, Cyrillic letters, and the space
# character. tokenize() treats any character outside this set as punctuation
# and isolates it as its own token.
letters = "qwertyuiopasdfghjklzxcvbnmёйцукенгшщзхъфывапролджэячсмитьбю "
|
|
|
def countwords(x):
    """Return a dict mapping each item of *x* to its number of occurrences."""
    counts = {}
    for item in x:
        # dict.get with a default folds the "first time seen" case into one line.
        counts[item] = counts.get(item, 0) + 1
    return counts
|
|
|
def add_dict(a, b):
    """Return a new dict with the union of keys from *a* and *b*.

    Values for keys present in both inputs are summed; neither input is
    modified.
    """
    merged = dict(a)
    for key, value in b.items():
        # Keys already copied from *a* accumulate; new keys start from 0.
        merged[key] = merged.get(key, 0) + value
    return merged
|
|
|
class Chatbot:
    """A tiny bag-of-words response selector.

    Training data maps input phrases to responses.  Each distinct response
    accumulates word counts over all phrases that map to it; an incoming
    message is scored against every response by summing the per-response
    probabilities of its tokens, and the best-scoring response wins.
    """

    def __init__(self, name=None, letter_replace: bool = True, data: dict = None, frequency_weight: float = 0, div_by_len: bool = False):
        # name: optional label for this bot (stored, not used internally).
        # letter_replace: fold visually similar Latin/Cyrillic letters (see
        #   module-level similar_letters) during tokenization.
        # data: optional {phrase: response} mapping to train on immediately.
        # frequency_weight: blend factor between the raw token score and the
        #   response's training frequency (0 = ignore frequency entirely).
        # div_by_len: normalize scores by the query's token count.
        self.name = name
        self.letter_replace = letter_replace
        self.frequency_weight = frequency_weight
        self.div_by_len = div_by_len
        self.model = {}  # response -> {"word count", "probabilities", "weight count", "weight"}
        if data is not None:
            self.train(data)

    def tokenize(self, text: str):
        """Lowercase *text* and split it into word/punctuation tokens.

        Characters in the known alphabet are kept (optionally folded via
        similar_letters); anything else is padded with spaces so that
        split() isolates it as its own token.
        """
        # Accumulate pieces in a list and join once — avoids the quadratic
        # cost of repeated string concatenation in the original.
        pieces = []
        for ch in text.lower():
            if ch in letters:
                if self.letter_replace and ch in similar_letters:
                    pieces.append(similar_letters[ch])
                else:
                    pieces.append(ch)
            else:
                pieces.append(" " + ch + " ")
        return "".join(pieces).split()

    def train(self, data: dict):
        """Train (or continue training) on a {phrase: response} mapping.

        Builds per-response word counts, converts them to probabilities,
        and records each response's relative frequency as its weight.
        """
        if not data:
            # Original raised ZeroDivisionError on 1/len(data); an empty
            # mapping simply has nothing to learn.
            return
        lendata_div = 1 / len(data)
        for phrase in data:
            response = data[phrase]
            counts = countwords(self.tokenize(phrase))
            if response not in self.model:
                self.model[response] = {"word count": counts, "probabilities": {}, "weight count": 1, "weight": 0}
            else:
                self.model[response]["word count"] = add_dict(counts, self.model[response]["word count"])
                self.model[response]["weight count"] += 1
        for response in self.model:
            entry = self.model[response]
            # Normalize raw counts into per-response token probabilities.
            div = 1 / math.fsum(entry["word count"].values())
            entry["probabilities"] = {word: count * div for word, count in entry["word count"].items()}
            # NOTE(review): calling train() more than once recomputes weights
            # against only the latest batch's size — preserved as-is.
            entry["weight"] = entry["weight count"] * lendata_div

    def get_responses(self, text: str):
        """Score every known response against *text*.

        Returns a list of (response, score) pairs sorted best-first.
        """
        tokens = self.tokenize(text)
        # Guard: the original divided by len(tokens) unconditionally, so a
        # message with no tokens raised ZeroDivisionError even when
        # div_by_len was False.
        lentokens_div = 1 / len(tokens) if tokens else 0
        scores = []
        for choice in self.model:
            entry = self.model[choice]
            score = 0
            for token in tokens:
                if token in entry["probabilities"]:
                    score += entry["probabilities"][token]
            if self.div_by_len:
                score *= lentokens_div
            # Linear blend: frequency_weight of the response's training
            # frequency, (1 - frequency_weight) of the raw token score.
            score *= self.frequency_weight * entry["weight"] + (1 - self.frequency_weight)
            scores.append((choice, score))
        return sorted(scores, key=lambda pair: pair[1], reverse=True)

    def __call__(self, text: str):
        """Return the single best response for *text*.

        Raises IndexError if the model has not been trained yet.
        """
        return self.get_responses(text)[0][0]
|
|
|
if __name__ == "__main__":
    import json

    # Dataset is a JSON object mapping user phrases to bot responses.
    # Explicit UTF-8: without it, json.load fails on Cyrillic content
    # wherever the platform default encoding isn't UTF-8 (e.g. cp1251
    # on Windows).
    with open("dataset.json", "r", encoding="utf-8") as file:
        data = json.load(file)

    cb = Chatbot(data=data)

    # Simple REPL: chat until the bot answers "Пока" ("Bye").
    while True:
        message = input("User: ")
        response = cb(message)
        print("Chatbot:", response)
        if response == "Пока":
            break
|