Upload README.md
Browse files
README.md
CHANGED
@@ -1,109 +1,79 @@
|
|
1 |
---
|
2 |
-
license:
|
3 |
datasets:
|
4 |
- dmitva/human_ai_generated_text
|
5 |
-
language:
|
6 |
-
- en
|
7 |
-
widget:
|
8 |
-
- text: "This model trains on a diverse dataset and serves functions in applications requiring a mechanism for distinguishing between human and AI-generated text."
|
9 |
-
tags:
|
10 |
-
- nlp
|
11 |
-
- code
|
12 |
-
inference: false
|
13 |
---
|
14 |
|
15 |
-
|
16 |
|
17 |
-
The
|
18 |
|
19 |
-
|
20 |
|
21 |
-
```
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
```
|
24 |
|
25 |
-
|
26 |
|
27 |
```Python
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
[BOS_ID] +
|
32 |
-
tokenize("[CLS]") + tokenize("[INST]") + tokenize(USER_MESSAGE_1) + tokenize("[/INST]") +
|
33 |
-
tokenize(BOT_MESSAGE_1) + tokenize("[SEP]") +
|
34 |
-
…
|
35 |
-
tokenize("[INST]") + tokenize(USER_MESSAGE_N) + tokenize("[/INST]") +
|
36 |
-
tokenize(BOT_MESSAGE_N) + tokenize("[SEP]") + [EOS_ID]
|
37 |
-
```
|
38 |
-
|
39 |
-
Notes:
|
40 |
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
- The `tokenize_chinese_chars` parameter indicates special handling for Chinese characters.
|
47 |
-
- The maximum model length is set to 512 tokens.
|
48 |
-
|
49 |
-
## Installing Libraries
|
50 |
-
|
51 |
-
```sh
|
52 |
-
pip install torch transformers
|
53 |
-
```
|
54 |
|
55 |
-
|
|
|
|
|
|
|
56 |
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
|
61 |
-
|
|
|
62 |
|
63 |
-
|
64 |
-
|
|
|
65 |
|
66 |
# Input text
|
67 |
text = "This model trains on a diverse dataset and serves functions in applications requiring a mechanism for distinguishing between human and AI-generated text."
|
68 |
|
69 |
-
#
|
70 |
-
|
71 |
-
|
72 |
-
# Run the model
|
73 |
-
outputs = model(**inputs)
|
74 |
-
|
75 |
-
# Interpret the output
|
76 |
-
logits = outputs.logits
|
77 |
|
78 |
-
#
|
79 |
-
|
|
|
|
|
|
|
80 |
|
81 |
-
#
|
82 |
-
|
|
|
|
|
|
|
83 |
|
84 |
-
#
|
85 |
-
|
86 |
-
print(f"AI Probability: {ai_prob:.4f}")
|
87 |
|
88 |
-
#
|
89 |
-
if
|
90 |
-
print("The text is likely
|
91 |
else:
|
92 |
-
print("The text is likely
|
93 |
-
```
|
94 |
-
|
95 |
-
## Citation
|
96 |
-
|
97 |
-
Please cite the paper if you are using the resource for your work.
|
98 |
|
99 |
-
|
100 |
-
@misc{abiodunfinbarrsoketunji-agtd2023,
|
101 |
-
doi = {10.48550/arXiv.2311.15565},
|
102 |
-
url = {https://arxiv.org/abs/2311.15565},
|
103 |
-
author = {Abiodun Finbarrs Oketunji},
|
104 |
-
title = {Evaluating the Efficacy of Hybrid Deep Learning Models in Distinguishing AI-Generated Text},
|
105 |
-
publisher = {arXiv},
|
106 |
-
year = {2023},
|
107 |
-
copyright = {arXiv.org perpetual, non-exclusive license}
|
108 |
-
}
|
109 |
```
|
|
|
1 |
---
|
2 |
+
license: apache-2.0
|
3 |
datasets:
|
4 |
- dmitva/human_ai_generated_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
---
|
6 |
|
7 |
+
## 0xnu/AGTD-v0.2
|
8 |
|
9 |
+
The "0xnu/AGTD-v0.2" model represents a significant breakthrough in distinguishing between text written by humans and one generated by Artificial Intelligence (AI). It is rooted in sophisticated algorithms and offers exceptional accuracy and efficiency in text analysis and classification. Everything is detailed in the study and accessible [here](https://arxiv.org/abs/2311.15565).
|
10 |
|
11 |
+
### Training Details
|
12 |
|
13 |
+
```sh
|
14 |
+
Precision: 0.6269
|
15 |
+
Recall: 1.0000
|
16 |
+
F1-score: 0.7707
|
17 |
+
Accuracy: 0.7028
|
18 |
+
Confusion Matrix:
|
19 |
+
[[197 288]
|
20 |
+
[ 0 484]]
|
21 |
```
|
22 |
|
23 |
+
### Run the model
|
24 |
|
25 |
```Python
|
26 |
+
# Select the TensorFlow backend before Keras is imported.
import os
os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
import tensorflow as tf
import pickle
import numpy as np
from huggingface_hub import hf_hub_download

# Hugging Face repository details.
# NOTE(review): repo id says AGTD-v0.1 while this model card is titled
# 0xnu/AGTD-v0.2 — confirm which repository the artefacts live in.
REPO_ID = "0xnu/AGTD-v0.1"
MODEL_FILENAME = "human_ai_text_classification_model.keras"
TOKENIZER_FILENAME = "tokenizer.pkl"

# Fetch the model and tokenizer artefacts from the Hub (cached locally).
model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME)
tokenizer_path = hf_hub_download(repo_id=REPO_ID, filename=TOKENIZER_FILENAME)

# Load the trained Keras classifier.
model = keras.models.load_model(model_path)

# Load the fitted tokenizer.
# NOTE: pickle.load executes arbitrary code — only unpickle artefacts from
# a repository you trust.
with open(tokenizer_path, "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)
|
50 |
|
51 |
# Input text to classify.
text = "This model trains on a diverse dataset and serves functions in applications requiring a mechanism for distinguishing between human and AI-generated text."

# Parameters (these should match the training parameters)
MAX_LENGTH = 100000  # padded sequence length fed to the model
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
+
# Tokenization function
|
58 |
+
def tokenize_text(text, tokenizer, max_length):
    """Encode *text* with *tokenizer* and pad/truncate it to *max_length*.

    Returns a 2-D integer array of shape (1, max_length); both padding and
    truncation are applied at the end of the sequence ('post').
    """
    seqs = tokenizer.texts_to_sequences([text])
    return tf.keras.preprocessing.sequence.pad_sequences(
        seqs, maxlen=max_length, padding='post', truncating='post'
    )
|
62 |
|
63 |
+
# Prediction function
|
64 |
+
def predict_text(text, model, tokenizer, max_length):
    """Return the model's raw score for *text* (scalar from the first output)."""
    encoded = tokenize_text(text, tokenizer, max_length)
    return model.predict(encoded)[0][0]
|
68 |
|
69 |
+
# Make prediction
|
70 |
+
# Score the input text.
prediction = predict_text(text, model, tokenizer, MAX_LENGTH)

# Interpret the score: values at or above 0.5 are reported as AI-generated,
# everything below as human-written.
looks_ai = prediction >= 0.5
if looks_ai:
    print(f"The text is likely AI-generated (confidence: {prediction:.2f})")
else:
    print(f"The text is likely human-written (confidence: {1-prediction:.2f})")

# Also surface the unthresholded model output for debugging.
print(f"Raw prediction value: {prediction}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
```
|