serdarakyol
commited on
Commit
•
48b9a4a
1
Parent(s):
fcd822a
Create README.md
Browse files
README.md
ADDED
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language: tr
|
3 |
+
datasets: interpress_news_category_tr
|
4 |
+
---
|
5 |
+
# INTERPRESS NEWS CLASSIFICATION
|
6 |
+
## Dataset
|
7 |
+
The dataset was downloaded from Interpress. It is real-world data: the full collection contains 273K articles, of which 108K remained after filtering and were used to train this model. For more information about the dataset, please visit this [link](https://huggingface.co/datasets/interpress_news_category_tr)
|
8 |
+
|
9 |
+
## Model
|
10 |
+
Model accuracy on both the training and validation data is 97%.
|
11 |
+
|
12 |
+
## Usage
|
13 |
+
```sh
|
14 |
+
pip install transformers or pip install transformers==4.3.3
|
15 |
+
```
|
16 |
+
```python
|
17 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Fetch the fine-tuned tokenizer/model pair from the Hugging Face Hub.
checkpoint = "Serdar/bert-model"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
|
22 |
+
```
|
23 |
+
```python
|
24 |
+
# PREPROCESSING
|
25 |
+
import re
|
26 |
+
my_punc = r"#$%&()*+-/:;<=>@[\]^_{|}~"
|
27 |
+
|
28 |
+
def clean_url(content):
    """Remove URL-like tokens (anything containing a dot followed by a known
    TLD) from *content*, including at most one trailing whitespace character.
    """
    url_pattern = re.compile(
        r'[\S]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil)[\S]*\s?'
    )
    return url_pattern.sub('', content)
|
33 |
+
|
34 |
+
def clean_email(content):
    r"""Remove e-mail-address-like tokens from *content*.

    Every ``\S*@\S*`` run (plus at most one trailing whitespace character)
    is deleted.

    Parameters
    ----------
    content : str
        Raw text that may contain e-mail addresses.

    Returns
    -------
    str
        *content* with all e-mail-like tokens removed.
    """
    # Fix: use a raw string. The original non-raw '\S*@\S*\s?' only worked
    # because Python keeps unknown escapes verbatim, which now emits a
    # SyntaxWarning and is slated to become an error.
    reg_email = r'\S*@\S*\s?'
    pattern_email = re.compile(reg_email)
    content = pattern_email.sub('', content)
    return content
|
39 |
+
|
40 |
+
def clean_punctuation(content):
    """Delete every character listed in the module-level ``my_punc`` string."""
    # str.maketrans is a static method; calling it via the class makes that
    # explicit instead of going through the `content` instance.
    return content.translate(str.maketrans("", "", my_punc))
|
43 |
+
|
44 |
+
def clean_data(text):
    """Run the full cleaning pipeline on *text*.

    Strips URLs, e-mail addresses, and punctuation, then drops every
    whitespace-separated token of length <= 2 characters.
    """
    for cleaner in (clean_url, clean_email, clean_punctuation):
        text = cleaner(text)

    # Keep only tokens longer than two characters.
    kept_words = [word for word in text.split(" ") if len(word) > 2]
    return ' '.join(kept_words)
|
56 |
+
```
|
57 |
+
```python
|
58 |
+
import torch
|
59 |
+
import numpy as np
|
60 |
+
|
61 |
+
# Select the compute device, moving the model onto the GPU when one exists.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
if use_cuda:
    model = model.cuda()
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('GPU name is:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
|
69 |
+
```
|
70 |
+
```python
|
71 |
+
def prediction(news):
    """Classify one news article and return the predicted label index.

    The raw text is cleaned, tokenized to a fixed 512-token window, and run
    through the module-level ``model`` on the selected ``device``.
    """
    cleaned = clean_data(news)
    encoded = tokenizer.batch_encode_plus(
        [cleaned],
        max_length=512,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        return_tensors='pt')  # for tf tensors, switch pt to tf

    input_ids = encoded["input_ids"].clone().detach().to(device)
    attention = encoded["attention_mask"].clone().detach().to(device)

    # Inference only — no gradients needed.
    with torch.no_grad():
        output = model(input_ids, token_type_ids=None, attention_mask=attention)

    logits = output[0].detach().cpu().numpy()
    return np.argmax(logits, axis=1)[0]
|
93 |
+
```
|
94 |
+
|
95 |
+
```python
|
96 |
+
# Index -> human-readable category, matching the model's output head order.
label_names = [
    "Culture-Art",
    "Economy",
    "Politics",
    "Education",
    "World",
    "Sport",
    "Technology",
    "Magazine",
    "Health",
    "Agenda",
]
labels = dict(enumerate(label_names))
pred = prediction(news)
print(labels[pred])
|
110 |
+
```
|
111 |
+
Thanks to @yavuzkomecoglu for his contributions
|
112 |
+
|
113 |
+
If you have any questions, please don't hesitate to contact me
|
114 |
+
[![linkedin](https://img.shields.io/badge/LinkedIn-0077B5?style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/in/serdarakyol55/)
|
115 |
+
[![Github](https://img.shields.io/badge/GitHub-100000?style=for-the-badge&logo=github&logoColor=white)](https://github.com/serdarakyol)
|