CrabInHoney committed on
Commit
2bdc67a
1 Parent(s): 45448bb

Upload 4 files

Browse files
README.md CHANGED
@@ -1,3 +1,66 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #### 1D-CNN-MC-toxicity-classifier-ru
2
+ (One-Dimensional Convolutional Neural Network with Multi-Channel input)
3
+
4
+ Architectural visualization:
5
+
6
+ ![](https://i.imgur.com/skbLM6w.png)
7
+
8
+ Total parameters: 503249
9
+
10
+ ##### Test Accuracy: 94.44%
11
+ ##### Training Accuracy: 97.46%
12
+
13
+ This model was developed for binary toxicity classification (toxic / normal) of Russian-language Cyrillic text.
14
+
15
+ ##### A dataset of 75093 negative rows and 75093 positive rows was used for training.
16
+
17
+ ##### Recommended length of the input sequence: 25 - 400 Cyrillic characters.
18
+
19
+ ##### Simplifications of the dataset strings:
20
+ Removing extra spaces.
21
+
22
+ Replacing capital letters with small letters. (Я -> я).
23
+
24
+ Removing all non-Cyrillic characters, including prefixes (for example: z, !, ., #, 4, &).
25
+
26
+ Replacing ё with е (the Cyrillic letter "е", not the Latin "e").
27
+
28
+ ### Example of use:
29
+
30
import numpy as np
from tensorflow import keras
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from safetensors.numpy import load_file
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
import re

# Name of the folder where the model files are stored
model_dir = 'model'
# Sequences are padded/truncated to this length before being fed to the model
max_len = 400

# Any run of characters outside the Unicode Cyrillic block (U+0400-U+04FF)
_NON_CYRILLIC_RE = re.compile(r'[^\u0400-\u04FF]+')

# Load the model architecture
with open(os.path.join(model_dir, 'model_architecture.json'), 'r', encoding='utf-8') as json_file:
    model_json = json_file.read()
model = keras.models.model_from_json(model_json)

# Load the weights from safetensors; tensors are stored as weight_0, weight_1, ...
# and must be passed to set_weights() in that order
state_dict = load_file(os.path.join(model_dir, 'model_weights.safetensors'))
weights = [state_dict[f'weight_{i}'] for i in range(len(state_dict))]
model.set_weights(weights)

# Load the tokenizer
with open(os.path.join(model_dir, 'tokenizer.json'), 'r', encoding='utf-8') as f:
    tokenizer_json = f.read()
tokenizer = tokenizer_from_json(tokenizer_json)


def preprocess_text(text):
    """Apply the simplifications described for the training dataset.

    Steps: lowercase the text, replace "ё" with "е", drop every
    non-Cyrillic character and collapse extra spaces.

    Args:
        text: Raw input string.

    Returns:
        The normalized string (may be empty if the input contains no
        Cyrillic characters).
    """
    text = text.lower().replace('ё', 'е')
    # NOTE(review): non-Cyrillic runs are replaced with a space (not deleted)
    # so that neighbouring words do not fuse together; confirm that this
    # matches the original training pipeline.
    text = _NON_CYRILLIC_RE.sub(' ', text)
    # Remove leading/trailing and repeated spaces
    return ' '.join(text.split())


def predict_toxicity(text):
    """Classify a text as toxic or normal.

    The text is normalized the same way as the training data before it is
    tokenized, so that inference inputs match what the model was trained on.

    Args:
        text: Raw input string.

    Returns:
        A tuple ``(class_label, probability)`` where ``class_label`` is
        ``"toxic"`` when the model output is >= 0.5 and ``"normal"``
        otherwise, and ``probability`` is the raw model output.
    """
    sequences = tokenizer.texts_to_sequences([preprocess_text(text)])
    padded = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
    # predict() returns an array of shape (batch, 1); take the single score
    probability = model.predict(padded)[0][0]
    class_label = "toxic" if probability >= 0.5 else "normal"
    return class_label, probability


# Usage example
text = "Да какой идиот сделал эту НС?"
class_label, probability = predict_toxicity(text)
print(f"Text: {text}")
print(f"Class: {class_label} ({probability:.2%})")
63
+
64
+ ###### Output:
65
+ Text: Да какой идиот сделал эту НС?
66
+ Class: toxic (99.35%)
model_architecture.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"module": "keras", "class_name": "Sequential", "config": {"name": "sequential", "trainable": true, "dtype": {"module": "keras", "class_name": "DTypePolicy", "config": {"name": "float32"}, "registered_name": null}, "layers": [{"module": "keras.layers", "class_name": "InputLayer", "config": {"batch_shape": [8, 400], "dtype": "float32", "sparse": false, "name": "input_layer"}, "registered_name": null}, {"module": "keras.layers", "class_name": "Embedding", "config": {"name": "embedding", "trainable": true, "dtype": {"module": "keras", "class_name": "DTypePolicy", "config": {"name": "float32"}, "registered_name": null}, "input_dim": 10002, "output_dim": 48, "embeddings_initializer": {"module": "keras.initializers", "class_name": "RandomUniform", "config": {"seed": null, "minval": -0.05, "maxval": 0.05}, "registered_name": null}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false}, "registered_name": null, "build_config": {"input_shape": [8, 400]}}, {"module": "keras.layers", "class_name": "Conv1D", "config": {"name": "conv1d", "trainable": true, "dtype": {"module": "keras", "class_name": "DTypePolicy", "config": {"name": "float32"}, "registered_name": null}, "filters": 48, "kernel_size": [3], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [8, 400, 48]}}, {"module": "keras.layers", "class_name": "MaxPooling1D", "config": {"name": "max_pooling1d", "trainable": true, "dtype": 
{"module": "keras", "class_name": "DTypePolicy", "config": {"name": "float32"}, "registered_name": null}, "pool_size": [2], "padding": "valid", "strides": [2], "data_format": "channels_last"}, "registered_name": null}, {"module": "keras.layers", "class_name": "Conv1D", "config": {"name": "conv1d_1", "trainable": true, "dtype": {"module": "keras", "class_name": "DTypePolicy", "config": {"name": "float32"}, "registered_name": null}, "filters": 16, "kernel_size": [5], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [8, 199, 48]}}, {"module": "keras.layers", "class_name": "MaxPooling1D", "config": {"name": "max_pooling1d_1", "trainable": true, "dtype": {"module": "keras", "class_name": "DTypePolicy", "config": {"name": "float32"}, "registered_name": null}, "pool_size": [2], "padding": "valid", "strides": [2], "data_format": "channels_last"}, "registered_name": null}, {"module": "keras.layers", "class_name": "Flatten", "config": {"name": "flatten", "trainable": true, "dtype": {"module": "keras", "class_name": "DTypePolicy", "config": {"name": "float32"}, "registered_name": null}, "data_format": "channels_last"}, "registered_name": null, "build_config": {"input_shape": [8, 97, 16]}}, {"module": "keras.layers", "class_name": "Dense", "config": {"name": "dense", "trainable": true, "dtype": {"module": "keras", "class_name": "DTypePolicy", "config": {"name": "float32"}, "registered_name": null}, "units": 8, "activation": "relu", "use_bias": 
true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [8, 1552]}}, {"module": "keras.layers", "class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "dtype": {"module": "keras", "class_name": "DTypePolicy", "config": {"name": "float32"}, "registered_name": null}, "units": 1, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [8, 8]}}], "build_input_shape": [8, 400]}, "registered_name": null, "build_config": {"input_shape": [8, 400]}, "compile_config": {"optimizer": {"module": "keras.optimizers", "class_name": "Adam", "config": {"name": "adam", "learning_rate": 0.0010000000474974513, "weight_decay": null, "clipnorm": null, "global_clipnorm": null, "clipvalue": null, "use_ema": false, "ema_momentum": 0.99, "ema_overwrite_frequency": null, "loss_scale_factor": null, "gradient_accumulation_steps": null, "beta_1": 0.9, "beta_2": 0.999, "epsilon": 1e-07, "amsgrad": false}, "registered_name": null}, "loss": "binary_crossentropy", "loss_weights": null, "metrics": ["accuracy"], "weighted_metrics": null, "run_eagerly": false, "steps_per_execution": 1, "jit_compile": false}}
model_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8349a9b2f80b5c1a42053aea4a3959eb235d6c5845a585a0b38ccb1953f89f8c
3
+ size 2014060
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff