import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from tqdm.auto import tqdm

# Constants
batch_size = 1000

# Load tokenizer and model
model_checkpoint = "PleIAs/French-TV-Headline-Classification"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=512)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
classification_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Read the dataset
val_classification = pd.read_parquet("[file]")
val_classification.reset_index(drop=True, inplace=True)

# Calculate the number of batches needed
num_batches = (len(val_classification) + batch_size - 1) // batch_size

# Initialize the list to collect DataFrames
list_df = []

for i in tqdm(range(num_batches), desc="Processing batches"):
    start_index = i * batch_size
    end_index = min((i + 1) * batch_size, len(val_classification))
    batch = val_classification.iloc[start_index:end_index]

    # Extract texts from the DataFrame
    texts = batch["corrected_text"].tolist()

    # Classify texts in batches; top_k=None returns scores for all labels
    classifications = classification_pipeline(texts, truncation=True, padding=True, top_k=None)

    # Prepare data for DataFrame: one row per (text, label) pair
    rows = []
    for text_index, class_results in enumerate(classifications):
        for entry in class_results:
            rows.append({
                'text_id': start_index + text_index,
                'label': entry['label'],
                'score': round(entry['score'] * 100, 2),
                'identifier': batch.iloc[text_index]['identifier']
            })

    # Create DataFrame from the processed batch
    df = pd.DataFrame(rows)
    list_df.append(df)

# Concatenate all DataFrames in the list
final_df = pd.concat(list_df, ignore_index=True)
print(final_df)

# Save the resulting DataFrame to a CSV file
final_df.to_csv("transcript_classification.csv", index=False)
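
# --- Optional follow-up (a minimal sketch, not part of the original script) ---
# Because top_k=None emits a row for every label, final_df holds the full score
# distribution per transcript. If only the single highest-scoring label per
# transcript is needed, it can be reduced with a groupby. Column names match
# the DataFrame built above; the aggregation step itself is an assumption.
top_labels = final_df.loc[
    final_df.groupby("text_id")["score"].idxmax(),
    ["text_id", "identifier", "label", "score"],
]
print(top_labels.head())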