import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from tqdm.auto import tqdm

# Constants
batch_size = 1000

# Load tokenizer and model
model_checkpoint = "PleIAs/French-TV-Headline-Classification"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=512)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
classification_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Read the dataset
val_classification = pd.read_parquet("[file]")
val_classification.reset_index(drop=True, inplace=True)

# Calculate the number of batches needed
num_batches = (len(val_classification) + batch_size - 1) // batch_size

# Initialize the list to collect DataFrames
list_df = []

for i in tqdm(range(num_batches), desc="Processing batches"):
    start_index = i * batch_size
    end_index = min((i + 1) * batch_size, len(val_classification))
    batch = val_classification.iloc[start_index:end_index]

    # Extract texts from the DataFrame
    texts = batch["corrected_text"].tolist()

    # Classify texts in batches; top_k=None returns scores for all labels
    classifications = classification_pipeline(texts, truncation=True, padding=True, top_k=None)

    # Prepare data for DataFrame: one row per (text, label) pair
    rows = []
    for text_index, class_results in enumerate(classifications):
        for entry in class_results:
            rows.append({
                'text_id': start_index + text_index,
                'label': entry['label'],
                'score': round(entry['score'] * 100, 2),
                'identifier': batch.iloc[text_index]['identifier']
            })

    # Create DataFrame from the processed batch
    df = pd.DataFrame(rows)
    list_df.append(df)

# Concatenate all DataFrames in the list
final_df = pd.concat(list_df, ignore_index=True)
print(final_df)

# Save the resulting DataFrame to a CSV file
final_df.to_csv("transcript_classification.csv", index=False)
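
# --- Optional follow-up (a minimal sketch, not part of the original script) ---
# Because top_k=None emits a row for every label, final_df holds the full score
# distribution per transcript. If only the single highest-scoring label per
# transcript is needed, it can be reduced with a groupby. Column names match
# the DataFrame built above; the aggregation step itself is an assumption.
top_labels = final_df.loc[
    final_df.groupby("text_id")["score"].idxmax(),
    ["text_id", "identifier", "label", "score"],
]
print(top_labels.head())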