|
|
|
import gradio as gr |
|
import matplotlib |
|
import numpy as np |
|
import pandas as pd |
|
from apscheduler.schedulers.background import BackgroundScheduler |
|
|
|
# Column schemas for the three result files, as (column name, datatype tag)
# pairs in file order. Within this file only the names are consumed (through
# the task*_cols lists below); the datatype tags are not read anywhere here.

# Every task's file starts with the same two identifying text columns.
_IDENTITY_COLS = [("Team", "str"), ("Email", "str")]

# Task 1: financial classification.
TASK1_COLS = [
    *_IDENTITY_COLS,
    ("Acc", "number"),
    ("F1", "number"),
    ("MCC", "number"),
    ("DLT", "number"),
]

# Task 2: financial text summarization.
TASK2_COLS = [
    *_IDENTITY_COLS,
    ("Rouge-1", "number"),
    ("Rouge-2", "number"),
    ("Rouge-L", "number"),
    ("BertScore", "number"),
    ("BartScore", "number"),
    ("DLT", "number"),
]

# Task 3: single stock trading (overall Sharpe ratio plus one per ticker).
TASK3_COLS = [
    *_IDENTITY_COLS,
    ("Sharpe Ratio", "number"),
    ("Sharpe Ratio - DRIV", "number"),
    ("Sharpe Ratio - FORM", "number"),
    ("Sharpe Ratio - JNJ", "number"),
    ("Sharpe Ratio - MSFT", "number"),
]

# Plain column-name lists, in schema order, used as the CSV column names.
task1_cols, task2_cols, task3_cols = (
    [name for name, _kind in schema]
    for schema in (TASK1_COLS, TASK2_COLS, TASK3_COLS)
)
|
|
|
|
|
def create_df_dict(lang, lang_cols):
    """Load one task's results file into the mapping the UI iterates over.

    Args:
        lang: File-name prefix. The data is read from ``f"{lang}_result.csv"``,
            resolved against the current working directory.
        lang_cols: Column names, in file order, passed to ``pd.read_csv`` as
            ``names=``. Must contain ``"Team"``.

    Returns:
        ``{"overall": DataFrame}``, where the frame has ``"Team"`` as its first
        column followed by the remaining columns in their original order.

    Raises:
        KeyError: If ``"Team"`` is not among ``lang_cols``.
        FileNotFoundError: If the CSV file does not exist.
    """
    # No header= is given together with names=, so the first line of the file
    # is read as a data row, not as a header.
    leaderboard_df = pd.read_csv(f"{lang}_result.csv", names=lang_cols)

    # Pin the team name to the first column regardless of where it sits in the
    # file; every other column keeps its relative position.
    other_cols = [col for col in leaderboard_df.columns if col != "Team"]
    leaderboard_df = leaderboard_df[["Team"] + other_cols]

    return {"overall": leaderboard_df}
|
|
|
|
|
# Leaderboard tables keyed by the tab label shown in the UI. Each value is the
# dict returned by create_df_dict for that task's "<prefix>_result.csv".
# The files are read here, once, when this module is executed.
df_lang = {
    tab_label: create_df_dict(file_prefix, column_names)
    for tab_label, file_prefix, column_names in (
        ("Task 1", "task1", task1_cols),
        ("Task 2", "task2", task2_cols),
        ("Task 3", "task3", task3_cols),
    )
}
|
|
|
|
|
|
|
# Page heading, rendered as raw HTML at the top of the app.
# NOTE(review): the leading emoji had been corrupted into the two characters
# "π²" (a 4-byte UTF-8 emoji decoded with a Greek code page). Only its first
# and last bytes survive, so the exact glyph cannot be recovered; DRAGON FACE
# is a best guess - confirm against the deployed page. It is written as a
# \N{...} escape so the source stays ASCII and cannot be re-mangled.
TITLE = '<h1 align="center" id="space-title">\N{DRAGON FACE} IJCAI 2024 FinLLM Challenge Leaderboard</h1>'
|
# Markdown shown under the title: challenge overview, per-task details, the
# data-leakage test description and the top three teams per task.
#
# The emoji in this text had been corrupted (4-byte UTF-8 sequences decoded
# with a Greek code page), leaving a stray "π" in front of the section names
# and "π₯" in front of every ranked team. They are restored as \N{...} escapes
# so the source stays ASCII.
#   * Ranked teams: the surviving bytes match the medal emoji block and each
#     list has exactly three ranked entries, so first/second/third place
#     medals are used.
#   * NOTE(review): for the three section markers only the leading byte
#     survived, so BAR CHART / CHART WITH UPWARDS TREND / BOOKS are best
#     guesses - confirm against the deployed page.
INTRODUCTION_TEXT = """\N{BAR CHART} Introduction

The FinLLM Challenge rigorously evaluates state-of-the-art models in financial text analysis, generation, and decision-making tasks. These tasks include financial classification, financial text summarization, and single stock trading.

\N{CHART WITH UPWARDS TREND} Unique Evaluation Metrics

Our leaderboard incorporates a comprehensive evaluation using diverse metrics like Accuracy, F1 Score, ROUGE, BERTScore, and Sharpe Ratio to assess the models' capabilities in real-world financial applications.

\N{BOOKS} Task Details

**Task 1: Financial Classification**

- **Objective:** Classify sentences as claims or premises.
- **Dataset:** 7.75k training data, 969 test data.
- **Evaluation Metrics:** F1 Score (final ranking metric) and Accuracy.

**Task 2: Financial Text Summarization**

- **Objective:** Summarize financial news articles into concise texts.
- **Dataset:** 8k training data, 2k test data.
- **Evaluation Metrics:** ROUGE (1, 2, L) and BERTScore (ROUGE-1 as the final ranking metric).

**Task 3: Single Stock Trading**

- **Objective:** Make stock trading decisions (buy, sell, hold) with reasonings.
- **Dataset:** 291 data points.
- **Evaluation Metrics:** Sharpe Ratio (final ranking metric), Cumulative Return, Daily and Annualized Volatility, Maximum Drawdown.

**Model Cheating Detection: Data Leakage Test (DLT)**

To measure the risk of data leakage from the test set used in training, we introduce the Data Leakage Test (DLT). The DLT calculates the difference in perplexity between the training set and the test set. A larger difference indicates a lower likelihood of model cheating, while a smaller difference suggests a higher likelihood.

For more details, refer to our [Challenge page](https://sites.google.com/nlg.csie.ntu.edu.tw/finnlp-agentscen/shared-task-finllm?authuser=0).

**Task 1: Top 3**
\N{FIRST PLACE MEDAL} Team Barclays
\N{SECOND PLACE MEDAL} Albatross
\N{THIRD PLACE MEDAL} L3iTC

**Task 2: Top 3**
\N{FIRST PLACE MEDAL} LBZ
\N{SECOND PLACE MEDAL} Upaya
\N{THIRD PLACE MEDAL} Finance Wizard

**Task 3: Top 3**
\N{FIRST PLACE MEDAL} Wealth Guide
\N{SECOND PLACE MEDAL} Upaya
\N{THIRD PLACE MEDAL} Albatross

"""
|
|
|
|
|
def create_data_interface(df):
    """Create a Gradio ``Dataframe`` component showing one leaderboard table.

    Args:
        df: pandas DataFrame to display. Its column labels become the table
            headers and its rows the table values.

    Returns:
        The ``gr.components.Dataframe`` built from ``df``.
    """
    # Derive each column's display type from the data itself. The previous
    # rule ("first column is text, every other column is a number") tagged
    # text columns such as "Email" as numeric.
    datatypes = [
        "number" if pd.api.types.is_numeric_dtype(dtype) else "str"
        for dtype in df.dtypes
    ]

    return gr.components.Dataframe(
        value=df.values.tolist(),
        headers=list(df.columns),
        datatype=datatypes,
    )
|
|
|
|
|
def plot_radar_chart(df, attributes, category_name):
    """Draw one filled polar trace per row of ``df`` and return the figure.

    NOTE(review): ``go`` is not imported anywhere in the visible part of this
    file (it looks like ``plotly.graph_objects``), so calling this function as
    the file stands raises ``NameError``. It also reads a ``"Model"`` column,
    while the leaderboard frames built in this file are keyed by ``"Team"``.
    Confirm the intended import and column before using it.

    Args:
        df: DataFrame with a ``"Model"`` column plus every column listed in
            ``attributes``.
        attributes: Column labels used as the angular axis; each row's values
            in these columns are the radial values.
        category_name: Accepted but not used by this function.

    Returns:
        The figure created by ``go.Figure()`` with all traces added.
    """
    chart = go.Figure()

    for _, record in df.iterrows():
        trace_name = record["Model"]
        radial_values = record[attributes].tolist()
        trace = go.Scatterpolar(
            r=radial_values,
            theta=attributes,
            fill="toself",
            name=trace_name,
        )
        chart.add_trace(trace)

    # Fixed title and radial range, with the legend naming each trace.
    radial_axis = dict(visible=True, range=[0, 0.9])
    chart.update_layout(
        title="FLARE",
        polar=dict(radialaxis=radial_axis),
        showlegend=True,
    )

    return chart
|
|
|
|
|
def create_data_interface_for_aggregated(df, category_name):
    """Build the radar chart for ``df`` using every column after the first.

    Args:
        df: DataFrame whose first column is skipped and whose remaining
            columns are plotted as chart attributes.
        category_name: Forwarded unchanged to ``plot_radar_chart``.

    Returns:
        Whatever ``plot_radar_chart`` returns for these inputs.
    """
    metric_columns = df.columns[1:]
    print(metric_columns)
    return plot_radar_chart(df, metric_columns, category_name)
|
|
|
|
|
def create_lang_leaderboard(df_dict):
    """Add one tab per entry of ``df_dict``, each holding that entry's table.

    Meant to be called while a Gradio layout context is open, as
    ``launch_gradio`` does.

    Args:
        df_dict: Mapping of tab label to the DataFrame shown in that tab.
    """
    for tab_label in df_dict:
        with gr.Tab(tab_label):
            create_data_interface(df_dict[tab_label])
|
|
|
|
|
def launch_gradio():
    """Build the leaderboard page and launch it.

    The page consists of the HTML title, the Markdown introduction, and one
    tab per entry of the module-level ``df_lang``; each of those tabs is
    filled by ``create_lang_leaderboard``. Ends by calling ``demo.launch()``.
    """
    with gr.Blocks() as demo:
        gr.HTML(TITLE)
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

        for task_label, task_tables in df_lang.items():
            with gr.Tab(task_label):
                create_lang_leaderboard(task_tables)

    demo.launch()
|
|
|
|
|
# Script entry point: build the UI once and serve it.
#
# An hourly BackgroundScheduler job that re-ran `launch_gradio` used to be
# registered here. It was removed because it could not do anything useful:
#   * the result CSVs are read only once, when `df_lang` is built while this
#     module is executed, so a re-run showed exactly the same data; and
#   * every run built another Blocks app and called `launch()` again from a
#     scheduler worker thread while the first server was still running.
# To publish new results, update the CSV files and restart the process.
# (The BackgroundScheduler import at the top of the file is now unused.)
launch_gradio()
|
|