"""Fine-tune BERT for fake-news detection and serve it through a Gradio UI.

Running this script performs, in order:

1. Load the ``GonzaloA/fake_news`` dataset and tokenize its ``text`` column.
2. Fine-tune ``bert-base-uncased`` on a small shuffled subset of the
   ``train`` split, evaluating on a subset of the ``test`` split each epoch.
3. Print the final evaluation metrics and save the model and tokenizer to
   ``TeamQuad-fine-tuned-bert``.
4. Reload the saved artifacts, wrap them in a text-classification pipeline
   and launch a Gradio interface.
"""

# Importing necessary libraries
import inspect

import gradio as gr
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TextClassificationPipeline,
    Trainer,
    TrainingArguments,
)

# Token budget shared by training and inference so both see inputs truncated
# the same way.
MAX_LENGTH = 128
# Upper bound on the number of rows sampled from each split.
SUBSET_SIZE = 1000
# Fixed seed so the sampled subsets are reproducible between runs.
SEED = 42
# Directory the fine-tuned model and tokenizer are written to / read from.
MODEL_DIR = 'TeamQuad-fine-tuned-bert'

# Label mapping for fake news detection.
# NOTE(review): assumes class id 0 = 'fake' and 1 = 'true' -- confirm against
# the dataset card.
label_mapping = {0: 'fake', 1: 'true'}

# Load the dataset
ds = load_dataset("GonzaloA/fake_news")

# Load pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of dataset examples.

    Args:
        examples: A batch from the dataset; its ``'text'`` entry is passed
            to the tokenizer.

    Returns:
        The tokenizer output, padded and truncated to ``MAX_LENGTH`` tokens.
    """
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH,
    )


def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        pred: Prediction object exposing ``label_ids`` (gold labels) and
            ``predictions`` (per-class scores; argmax over the last axis is
            taken as the predicted class).

    Returns:
        A dict with the keys ``accuracy``, ``f1``, ``precision``, ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 keeps the 0.0 result for undefined scores but silences
    # the warning emitted when a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}


def _sample_split(split, size=SUBSET_SIZE, seed=SEED):
    """Return a seeded random subset of at most ``size`` rows from ``split``."""
    # Clamp to the split length so small splits do not raise on select().
    return split.shuffle(seed=seed).select(range(min(size, len(split))))


# Apply tokenization
tokenized_datasets = ds.map(tokenize_function, batched=True)

# Load pre-trained BERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=2
)

# Newer transformers releases renamed `evaluation_strategy` to
# `eval_strategy`; use whichever name the installed version accepts.
_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    **{_strategy_arg: 'epoch'},
)

# Create trainer instance. compute_metrics is supplied up front so that the
# per-epoch evaluations during training report the custom metrics too, not
# only the final evaluate() call.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=_sample_split(tokenized_datasets['train']),
    eval_dataset=_sample_split(tokenized_datasets['test']),
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()
print(eval_result)

# Save the fine-tuned model and tokenizer
trainer.save_model(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

# Load the fine-tuned model and tokenizer
new_model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
new_tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

# Create a classification pipeline
classifier = TextClassificationPipeline(model=new_model, tokenizer=new_tokenizer)


def classify_news(text):
    """Classify a piece of news text as fake or true.

    Args:
        text: The headline or article entered by the user.

    Returns:
        A string of the form ``"Label: <fake|true>, Score: <0.0000>"``, or a
        short prompt when ``text`` is empty or only whitespace.
    """
    if not text or not text.strip():
        return "Please enter some text to classify."

    # Truncate exactly as during training; without this, inputs longer than
    # the model's maximum sequence length make the pipeline fail.
    result = classifier(text, truncation=True, max_length=MAX_LENGTH)
    top = result[0]

    # Resolve the pipeline's label string (e.g. 'LABEL_0') to its class id via
    # the model config rather than parsing the string by hand.
    label_id = new_model.config.label2id[top['label']]
    return f"Label: {label_mapping[label_id]}, Score: {top['score']:.4f}"


# Create a Gradio interface
iface = gr.Interface(
    fn=classify_news,  # The function to process the input
    inputs=gr.Textbox(
        lines=10,
        placeholder="Enter a news headline or article to classify...",
    ),
    outputs="text",  # Output will be displayed as text
    title="Fake News Detection",
    description="Enter a news headline or article and see whether the model classifies it as 'Fake News' or 'True News'.",
)

# Launch the interface
iface.launch(share=True)