import pandas as pd
import chromadb
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments, pipeline
import gradio as gr
import email

# loading and preprocessing dataset (Kaggle Enron emails.csv)
emails = pd.read_csv('emails.csv')

def preprocess_email_content(raw_email):
    # parse the raw RFC 822 message and keep only the body text
    message = email.message_from_string(raw_email).get_payload()
    # replace newlines with spaces (so words on adjacent lines don't merge)
    # and strip quoted-reply markers left over from forwarded threads
    return message.replace("\n", " ").replace("\r", "").replace("> >>> > >", "").strip()

content_text = [preprocess_email_content(item) for item in emails['message']]
# keep only a tiny sample; training on more emails was not feasible here
train_content, _ = train_test_split(content_text, train_size=0.00005)

# ChromaDB setup
client = chromadb.Client()
collection = client.create_collection(name="Enron_emails")
collection.add(documents=train_content, ids=[f'id{i+1}' for i in range(len(train_content))])
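# note: the collection above is only indexed; this script never queries it.
# a retrieval step, if one were added, might look like this sketch
# (n_results is an illustrative value, not part of the original script):
#   relevant = collection.query(query_texts=["What is Enron?"], n_results=3)
#   context = " ".join(relevant["documents"][0])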

# model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
# GPT-2 has no pad token by default; add one and resize the embedding
# matrix so the new token id is valid during training
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# training data: TextDataset tokenizes the file contents itself, so it must
# be given raw text rather than pre-tokenized ids. (TextDataset is deprecated
# in recent transformers releases in favor of the datasets library, but it is
# kept here to match the original imports.)
with open('train_emails.txt', 'w') as file:
    for text in train_content:
        file.write(text + '\n')

dataset = TextDataset(tokenizer=tokenizer, file_path='train_emails.txt', block_size=128)
# causal LM collator (mlm=False): GPT-2 is trained on next-token prediction
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
training_args = TrainingArguments(
    output_dir='./output',
    num_train_epochs=3,
    per_device_train_batch_size=8
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)
trainer.train()

# saving the model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
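# the fine-tuned weights can be reloaded later via the standard
# from_pretrained API:
#   model = GPT2LMHeadModel.from_pretrained("./fine_tuned_model")
#   tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_model")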

# Gradio interface
text_gen = pipeline("text-generation", model=model, tokenizer=tokenizer)

def question_answer(question):
    try:
        generated = text_gen(question, max_length=200, num_return_sequences=1)
        # the pipeline echoes the prompt, so remove it before returning
        generated_text = generated[0]['generated_text'].replace(question, "").strip()
        return generated_text
    except Exception as e:
        return f"Error in generating response: {str(e)}"
iface = gr.Interface(
    fn=question_answer,
    inputs="text",
    outputs="text",
    title="Answering questions about the Enron case.",
    description="Ask a question about the Enron case!",
    examples=["What is Eron?"]
)
iface.launch()