CrabInHoney
committed on
Commit
•
2bdc67a
1
Parent(s):
45448bb
Upload 4 files
Browse files- README.md +66 -3
- model_architecture.json +1 -0
- model_weights.safetensors +3 -0
- tokenizer.json +0 -0
README.md
CHANGED
@@ -1,3 +1,66 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#### 1D-CNN-MC-toxicity-classifier-ru
|
2 |
+
(One-Dimensional Convolutional Neural Network with Multi-Channel input)
|
3 |
+
|
4 |
+
Architectural visualization:
|
5 |
+
|
6 |
+
![](https://i.imgur.com/skbLM6w.png)
|
7 |
+
|
8 |
+
Total parameters: 503249
|
9 |
+
|
10 |
+
##### Test Accuracy: 94.44%
|
11 |
+
##### Training Accuracy: 97.46%
|
12 |
+
|
13 |
+
This model is developed for binary classification of Cyrillic text.
|
14 |
+
|
15 |
+
##### A dataset of 75093 negative rows and 75093 positive rows was used for training.
|
16 |
+
|
17 |
+
##### Recommended length of the input sequence: 25 - 400 Cyrillic characters.
|
18 |
+
|
19 |
+
##### Simplifications of the dataset strings:
|
20 |
+
Removing extra spaces.
|
21 |
+
|
22 |
+
Converting uppercase letters to lowercase (Я -> я).
|
23 |
+
|
24 |
+
Removing any non-Cyrillic characters, including prefixes. (Removed: z, !, ., #, 4, &, etc.)
|
25 |
+
|
26 |
+
Replacing ё with е (the Cyrillic letter е, not the Latin e).
|
27 |
+
|
28 |
+
### Example of use:
|
29 |
+
|
30 |
+
import numpy as np
|
31 |
+
from tensorflow import keras
|
32 |
+
from tensorflow.keras.preprocessing.text import tokenizer_from_json
|
33 |
+
from safetensors.numpy import load_file
|
34 |
+
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
35 |
+
import os
|
36 |
+
import re
|
37 |
+
# Name of the folder where the model files are stored
model_dir = 'model'
# Length every token sequence is padded/truncated to in predict_toxicity
max_len = 400

architecture_path = os.path.join(model_dir, 'model_architecture.json')
weights_path = os.path.join(model_dir, 'model_weights.safetensors')
tokenizer_path = os.path.join(model_dir, 'tokenizer.json')

# Load the model architecture
with open(architecture_path, 'r', encoding='utf-8') as json_file:
    model = keras.models.model_from_json(json_file.read())

# Load the weights from safetensors; tensors are stored under the keys
# weight_0, weight_1, ... and are handed to the model in index order
state_dict = load_file(weights_path)
weight_keys = (f'weight_{i}' for i in range(len(state_dict)))
model.set_weights([state_dict[key] for key in weight_keys])

# Load the tokenizer
with open(tokenizer_path, 'r', encoding='utf-8') as f:
    tokenizer = tokenizer_from_json(f.read())
|
52 |
+
# Matches every character that is neither a lowercase Russian letter nor
# whitespace (applied after lowercasing and after ё has been replaced).
_NON_CYRILLIC_RE = re.compile(r'[^а-я\s]')


def normalize_text(text):
    """Apply the string simplifications the README lists for the dataset.

    Steps, in order: lowercase the text, replace ``ё`` with ``е``, delete
    every character that is not a Cyrillic letter or whitespace, and
    collapse whitespace runs into single spaces (leading/trailing
    whitespace is dropped).

    NOTE(review): the README describes these steps only in prose. Deleting
    (rather than replacing with a space) the non-Cyrillic characters and
    restricting the alphabet to а-я are assumptions - confirm against the
    original training preprocessing.

    Args:
        text: Raw input string.

    Returns:
        The simplified string; may be empty.
    """
    lowered = text.lower().replace('ё', 'е')
    cyrillic_only = _NON_CYRILLIC_RE.sub('', lowered)
    return ' '.join(cyrillic_only.split())


def predict_toxicity(text, *, normalize=True):
    """Classify a single string as "toxic" or "normal".

    Args:
        text: The string to classify.
        normalize: When true (default), run ``normalize_text`` first so the
            input is simplified the same way the README says the dataset
            strings were. Pass ``False`` to feed the raw string to the
            tokenizer.

    Returns:
        A ``(class_label, probability)`` tuple, where ``probability`` is the
        model output for the text and ``class_label`` is ``"toxic"`` when
        that output is at least 0.5, otherwise ``"normal"``.
    """
    if normalize:
        text = normalize_text(text)
    sequences = tokenizer.texts_to_sequences([text])
    # Pad/truncate at the end so the sequence is exactly max_len tokens long.
    padded = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
    # One input row and one output unit: take the single scalar.
    probability = model.predict(padded)[0][0]
    class_label = "toxic" if probability >= 0.5 else "normal"
    return class_label, probability
|
58 |
+
# Usage example: classify one sentence and print the label with its score
text = "Да какой идиот сделал эту НС?"
class_label, probability = predict_toxicity(text)
print(
    f"Text: {text}",
    f"Class: {class_label} ({probability:.2%})",
    sep="\n",
)
|
63 |
+
|
64 |
+
###### Output:
|
65 |
+
Text: Да какой идиот сделал эту НС?
|
66 |
+
Class: toxic (99.35%)
|
model_architecture.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"module": "keras", "class_name": "Sequential", "config": {"name": "sequential", "trainable": true, "dtype": {"module": "keras", "class_name": "DTypePolicy", "config": {"name": "float32"}, "registered_name": null}, "layers": [{"module": "keras.layers", "class_name": "InputLayer", "config": {"batch_shape": [8, 400], "dtype": "float32", "sparse": false, "name": "input_layer"}, "registered_name": null}, {"module": "keras.layers", "class_name": "Embedding", "config": {"name": "embedding", "trainable": true, "dtype": {"module": "keras", "class_name": "DTypePolicy", "config": {"name": "float32"}, "registered_name": null}, "input_dim": 10002, "output_dim": 48, "embeddings_initializer": {"module": "keras.initializers", "class_name": "RandomUniform", "config": {"seed": null, "minval": -0.05, "maxval": 0.05}, "registered_name": null}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false}, "registered_name": null, "build_config": {"input_shape": [8, 400]}}, {"module": "keras.layers", "class_name": "Conv1D", "config": {"name": "conv1d", "trainable": true, "dtype": {"module": "keras", "class_name": "DTypePolicy", "config": {"name": "float32"}, "registered_name": null}, "filters": 48, "kernel_size": [3], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [8, 400, 48]}}, {"module": "keras.layers", "class_name": "MaxPooling1D", "config": {"name": "max_pooling1d", "trainable": true, "dtype": 
{"module": "keras", "class_name": "DTypePolicy", "config": {"name": "float32"}, "registered_name": null}, "pool_size": [2], "padding": "valid", "strides": [2], "data_format": "channels_last"}, "registered_name": null}, {"module": "keras.layers", "class_name": "Conv1D", "config": {"name": "conv1d_1", "trainable": true, "dtype": {"module": "keras", "class_name": "DTypePolicy", "config": {"name": "float32"}, "registered_name": null}, "filters": 16, "kernel_size": [5], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [8, 199, 48]}}, {"module": "keras.layers", "class_name": "MaxPooling1D", "config": {"name": "max_pooling1d_1", "trainable": true, "dtype": {"module": "keras", "class_name": "DTypePolicy", "config": {"name": "float32"}, "registered_name": null}, "pool_size": [2], "padding": "valid", "strides": [2], "data_format": "channels_last"}, "registered_name": null}, {"module": "keras.layers", "class_name": "Flatten", "config": {"name": "flatten", "trainable": true, "dtype": {"module": "keras", "class_name": "DTypePolicy", "config": {"name": "float32"}, "registered_name": null}, "data_format": "channels_last"}, "registered_name": null, "build_config": {"input_shape": [8, 97, 16]}}, {"module": "keras.layers", "class_name": "Dense", "config": {"name": "dense", "trainable": true, "dtype": {"module": "keras", "class_name": "DTypePolicy", "config": {"name": "float32"}, "registered_name": null}, "units": 8, "activation": "relu", "use_bias": 
true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [8, 1552]}}, {"module": "keras.layers", "class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "dtype": {"module": "keras", "class_name": "DTypePolicy", "config": {"name": "float32"}, "registered_name": null}, "units": 1, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [8, 8]}}], "build_input_shape": [8, 400]}, "registered_name": null, "build_config": {"input_shape": [8, 400]}, "compile_config": {"optimizer": {"module": "keras.optimizers", "class_name": "Adam", "config": {"name": "adam", "learning_rate": 0.0010000000474974513, "weight_decay": null, "clipnorm": null, "global_clipnorm": null, "clipvalue": null, "use_ema": false, "ema_momentum": 0.99, "ema_overwrite_frequency": null, "loss_scale_factor": null, "gradient_accumulation_steps": null, "beta_1": 0.9, "beta_2": 0.999, "epsilon": 1e-07, "amsgrad": false}, "registered_name": null}, "loss": "binary_crossentropy", "loss_weights": null, "metrics": ["accuracy"], "weighted_metrics": null, "run_eagerly": false, "steps_per_execution": 1, "jit_compile": false}}
|
model_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8349a9b2f80b5c1a42053aea4a3959eb235d6c5845a585a0b38ccb1953f89f8c
|
3 |
+
size 2014060
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|