FinBERT Model Token Size
How can I derive one score for each transcript using PyCharm?
Can you please correct my code:
import os
import re
import csv

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)
# Set the base path for the transcript files
basepath = 'D:/ANALYSIS/DATABASE/All'
output_directory = 'C:/Users/Desktop/'
os.makedirs(output_directory, exist_ok=True)
# Open the CSV file for writing and write the headers
with open(os.path.join(output_directory, 'FinBert_Sentiments.csv'), 'w', encoding='utf-8', newline='') as content:
    writer = csv.writer(content)
    writer.writerow(("Firm Name", "Label", "Score"))

    # Loop through all files in the base path
    for root, dirs, files in os.walk(basepath):
        for file in files:
            if file.endswith('.txt'):
                # Read the transcript and extract the firm name from its header line
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    transcript = f.read().lower()
                match = re.search(r'q\d\s\d{4}\s(.+)', transcript)
                firm_name = match.group(1) if match else ''

                # Split the transcript into 512-character chunks; each chunk is
                # also truncated to the model's 512-token limit below
                max_chunk_size = 512
                chunks = [transcript[i:i + max_chunk_size] for i in range(0, len(transcript), max_chunk_size)]

                for chunk in chunks:
                    # Tokenize and truncate the chunk to the model's maximum sequence length
                    tokens = tokenizer.encode_plus(chunk, max_length=512, truncation=True, return_tensors='pt')
                    input_ids = tokens['input_ids']

                    # Perform sentiment analysis on the chunk
                    results = nlp(tokenizer.decode(input_ids[0], skip_special_tokens=True))
                    label = results[0]['label']
                    score = results[0]['score']

                    # Write the chunk-level result to the CSV file
                    writer.writerow([firm_name, label, score])
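
For reference, below is a minimal sketch of one way the per-chunk results could be collapsed into a single score per transcript, reusing the `tokenizer` and `nlp` pipeline defined above. The `aggregate_transcript_score` helper, the sign mapping, and the exact label strings ('Positive', 'Negative', 'Neutral') are assumptions about the 'yiyanghkust/finbert-tone' output, not part of the FinBERT API, so verify them against what the pipeline actually returns.

def aggregate_transcript_score(transcript, chunk_chars=512):
    # Hypothetical helper (assumption, not the author's method): score each
    # character chunk, map its label to a sign (+1 positive, -1 negative,
    # 0 neutral), and average the signed scores into one number per transcript.
    chunks = [transcript[i:i + chunk_chars] for i in range(0, len(transcript), chunk_chars)]
    signed_scores = []
    for chunk in chunks:
        # Truncate each chunk to the model's 512-token limit before scoring,
        # mirroring the encode/decode pattern used in the code above
        tokens = tokenizer.encode_plus(chunk, max_length=512, truncation=True, return_tensors='pt')
        text = tokenizer.decode(tokens['input_ids'][0], skip_special_tokens=True)
        result = nlp(text)[0]
        sign = {'positive': 1.0, 'negative': -1.0, 'neutral': 0.0}.get(result['label'].lower(), 0.0)
        signed_scores.append(sign * result['score'])
    # One number per transcript: mean of the signed chunk scores
    return sum(signed_scores) / len(signed_scores) if signed_scores else 0.0

With a helper like this, the inner chunk loop could be replaced by a single row per file, e.g. writer.writerow([firm_name, aggregate_transcript_score(transcript)]), so the CSV contains one score for each transcript rather than one per chunk.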