ACMC committed
Commit a8dfddd
1 Parent(s): f1fc3d0
Files changed (2)
  1. app.py +94 -21
  2. utils.py +136 -60
app.py CHANGED
@@ -8,15 +8,26 @@ import datasets
 import gradio as gr
 import matplotlib.pyplot as plt
 
-from utils import (process_chat_file,
-                   transform_conversations_dataset_into_training_examples)
+from utils import (
+    process_chat_file,
+    transform_conversations_dataset_into_training_examples,
+)
 from validation import check_format_errors, estimate_cost, get_distributions
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
 
-def convert_to_dataset(files, do_spelling_correction, progress, whatsapp_name, datetime_dayfirst, message_line_format):
+def convert_to_dataset(
+    files,
+    do_spelling_correction,
+    progress,
+    whatsapp_name,
+    datetime_dayfirst,
+    message_line_format,
+    minutes_threshold,
+    min_messages_per_conversation,
+):
     modified_dataset = None
     for file in progress.tqdm(files, desc="Processing files"):
         try:
@@ -28,6 +39,8 @@ def convert_to_dataset(files, do_spelling_correction, progress, whatsapp_name, d
                     whatsapp_name=whatsapp_name,
                     datetime_dayfirst=datetime_dayfirst,
                     message_line_format=message_line_format,
+                    minutes_threshold=minutes_threshold,
+                    min_messages_per_conversation=min_messages_per_conversation,
                 )
             else:
                 # Concatenate the datasets
@@ -37,6 +50,8 @@ def convert_to_dataset(files, do_spelling_correction, progress, whatsapp_name, d
                     whatsapp_name=whatsapp_name,
                     datetime_dayfirst=datetime_dayfirst,
                     message_line_format=message_line_format,
+                    minutes_threshold=minutes_threshold,
+                    min_messages_per_conversation=min_messages_per_conversation,
                 )
                 modified_dataset = datasets.concatenate_datasets(
                     [modified_dataset, this_file_dataset]
@@ -57,6 +72,10 @@ def file_upload_callback(
     whatsapp_name,
     datetime_dayfirst,
     message_line_format,
+    minutes_threshold,
+    min_messages_per_conversation,
+    max_characters_per_message,
+    split_conversation_threshold,
     progress=gr.Progress(),
 ):
     logger.info(f"Processing {files}")
@@ -73,7 +92,7 @@ The {model_role} and the {user_role} can send multiple messages in a row, as a J
     # Check if the user has not chosen any files
     if not files or len(files) == 0:
         raise gr.Error("Please upload at least one file.")
-
+
     # Check if the user has not entered their whatsapp name
     if not whatsapp_name or len(whatsapp_name) == 0:
         raise gr.Error("Please enter your WhatsApp name.")
@@ -87,26 +106,43 @@ The {model_role} and the {user_role} can send multiple messages in a row, as a J
         whatsapp_name=whatsapp_name,
         datetime_dayfirst=datetime_dayfirst,
         message_line_format=message_line_format,
+        minutes_threshold=minutes_threshold,
+        min_messages_per_conversation=min_messages_per_conversation,
+    )
+    logger.info(
+        f"Number of conversations of dataset before being transformed: {len(dataset)}"
     )
-    logger.info(f"Number of conversations of dataset before being transformed: {len(dataset)}")
 
-    training_examples_ds = transform_conversations_dataset_into_training_examples(
+    full_examples_ds = transform_conversations_dataset_into_training_examples(
         conversations_ds=dataset,
         system_prompt=full_system_prompt,
         user_role=user_role,
        model_role=model_role,
        whatsapp_name=whatsapp_name,
+        minutes_threshold=minutes_threshold,
+        min_messages_per_conversation=min_messages_per_conversation,
+        split_conversation_threshold=split_conversation_threshold,
+        max_characters_per_message=max_characters_per_message,
+    )
+    total_number_of_generated_examples = len(full_examples_ds)
+    logger.info(
+        f"Total number of generated examples: {total_number_of_generated_examples}"
    )
-    logger.info(f"Number of training examples: {len(training_examples_ds)}")
 
     # Split into training and validation datasets (80% and 20%)
-    training_examples_ds = training_examples_ds.train_test_split(
-        test_size=validation_split, seed=42
-    )
-    training_examples_ds, validation_examples_ds = (
-        training_examples_ds["train"],
-        training_examples_ds["test"],
-    )
+    try:
+        split_examples_ds = full_examples_ds.train_test_split(
+            test_size=validation_split, seed=42
+        )
+        training_examples_ds, validation_examples_ds = (
+            split_examples_ds["train"],
+            split_examples_ds["test"],
+        )
+    except ValueError as e:
+        # This happens when there's not enough data to split into training and validation datasets
+        # In this case, we'll just use the whole dataset for training, nothing for validation
+        training_examples_ds = full_examples_ds
+        validation_examples_ds = datasets.Dataset.from_dict({})
     training_examples_ds = training_examples_ds  # .select(
     #     range(min(250, len(training_examples_ds)))
     # )
@@ -125,9 +161,12 @@ The {model_role} and the {user_role} can send multiple messages in a row, as a J
     )
 
     stats = {
-        "Format Errors": format_errors,
+        "Total number of training examples": total_number_of_generated_examples,
+        "Number of training examples": len(training_examples_ds),
+        "Number of validation examples": len(validation_examples_ds),
         "Number of examples missing system message": distributions["n_missing_system"],
         "Number of examples missing user message": distributions["n_missing_user"],
+        "Format Errors": format_errors,
         "Cost Statistics": cost_stats,
     }
 
@@ -156,9 +195,9 @@ The {model_role} and the {user_role} can send multiple messages in a row, as a J
     # If there's less than 50 training examples, show a warning message
     if len(training_examples_ds) < 50:
         gr.Warning(
-            "Warning: There are less than 50 training examples. The model may not perform well with such a small dataset. Consider adding more chat files to increase the number of training examples."
+            "There are less than 50 training examples. The model may not perform well with such a small dataset. Consider adding more chat files to increase the number of training examples."
         )
-
+
     system_prompt_to_use = full_system_prompt
 
     return (
@@ -245,14 +284,38 @@ with gr.Blocks(theme=theme) as demo:
 
     model_role = gr.Textbox(
         label="Role for Model",
-        info="This is a technical parameter. Usual values are 'model' or 'assistant'.",
+        info="This is a technical parameter. Usual values are 'model' (e.g. Vertex AI) or 'assistant' (e.g. OpenAI).",
         value="model",
     )
 
+    minutes_threshold = gr.Number(
+        label="Minutes Threshold",
+        info="Threshold in minutes to consider that a new message is a new conversation. The default value should work for most cases.",
+        value=180,
+    )
+
+    min_messages_per_conversation = gr.Number(
+        label="Minimum Messages per Conversation",
+        info="Minimum number of messages per conversation to consider it as a valid conversation. The default value should work for most cases.",
+        value=5,
+    )
+
+    max_characters_per_message = gr.Number(
+        label="Max Characters per Message",
+        info="One token is around 3 characters. The default value should work for most cases. For example, on Vertex AI, the maximum number of tokens per example is [32,000](https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini-supervised-tuning-prepare#sample-datasets), so keeping the default value will ensure that the examples are well within the limit.",
+        value=10000,
+    )
+
+    split_conversation_threshold = gr.Number(
+        label="Split Conversation Threshold",
+        info="Number of messages in a conversation to split it into multiple ones. The default value should work for most cases.",
+        value=40,
+    )
+
     message_line_format = gr.Textbox(
         label="Message Line Format",
         info="Format of each message line in the chat file, as a regular expression. The default value should work for most cases.",
-        value=r"\[?(?P<msg_datetime>\S+,\s\S+?(?:\s[APap][Mm])?)\]? (?:- )?(?P<contact_name>.+): (?P<message>.+)",
+        value=r"\[?(?P<msg_datetime>\S+,\s\S+?(?:\s[APap][Mm])?)\]? (?:- )?(?P<contact_name>.+?): (?P<message>.+)",
     )
 
     datetime_dayfirst = gr.Checkbox(
@@ -287,7 +350,13 @@ with gr.Blocks(theme=theme) as demo:
         variant="secondary",
     )
 
-    system_prompt_to_use = gr.Textbox(label="System Prompt that you can use", visible=False, interactive=False, show_copy_button=True, info="When using the model, if you're asked for a system prompt, you can use this text.")
+    system_prompt_to_use = gr.Textbox(
+        label="System Prompt that you can use",
+        visible=False,
+        interactive=False,
+        show_copy_button=True,
+        info="When using the model, if you're asked for a system prompt, you can use this text.",
+    )
     # output_example = gr.JSON(label="Example Training Example")
 
     with gr.Group():
@@ -316,6 +385,10 @@ with gr.Blocks(theme=theme) as demo:
             whatsapp_name,
             datetime_dayfirst,
            message_line_format,
+            minutes_threshold,
+            min_messages_per_conversation,
+            max_characters_per_message,
+            split_conversation_threshold,
         ],
         outputs=[
             output_file,
@@ -327,7 +400,7 @@ with gr.Blocks(theme=theme) as demo:
             num_total_tokens_per_example_plot,
             num_assistant_tokens_per_example_plot,
             system_prompt_to_use,
-            system_prompt_to_use
+            system_prompt_to_use,
         ],
     )
 
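
The try/except added around `train_test_split` above handles tiny datasets: with very few examples, an 80/20 split can leave the train or test side empty, and `datasets` raises a ValueError. A minimal standalone sketch of the pattern (not part of the commit; the one-row toy dataset is invented for illustration):

import datasets

# One example cannot be split 80/20: the train side would be empty.
full_examples_ds = datasets.Dataset.from_dict({"messages": [["hello"]]})

try:
    split = full_examples_ds.train_test_split(test_size=0.2, seed=42)
    training_examples_ds, validation_examples_ds = split["train"], split["test"]
except ValueError:
    # Fall back to training on everything, with an empty validation set,
    # mirroring the fallback introduced in this commit.
    training_examples_ds = full_examples_ds
    validation_examples_ds = datasets.Dataset.from_dict({})

print(len(training_examples_ds), len(validation_examples_ds))  # 1 0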
utils.py CHANGED
@@ -9,22 +9,25 @@ import dateutil.parser
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
-# %%
-# Now, create message groups ('conversations')
-# The idea is to group messages that are close in time
-# We'll use a 180 minute threshold
-MINUTES_THRESHOLD = 180
-MIN_MESSAGES_THRESHOLD = 5
-
 
-def group_messages(messages_iterable):
+# %%
+def group_messages(messages_iterable, minutes_threshold):
+    """
+    Groups messages in a conversation. If the difference between two consecutive messages is less than `minutes_threshold` minutes, they are grouped together.
+    """
     groups = []
-    current_group = [next(messages_iterable)]
+    current_group = []
+    try:
+        first_message = next(messages_iterable)
+        current_group.append(first_message)
+    except StopIteration:
+        logger.exception("No messages in the conversation")
+        return []
     for message in messages_iterable:
         assert len(current_group) > 0  # We should never have an empty group
         if (
             message["timestamp"] - current_group[-1]["timestamp"]
-            < MINUTES_THRESHOLD * 60
+            < minutes_threshold * 60
         ):
             current_group.append(message)
         else:
@@ -214,7 +217,16 @@ import os
 
 
 # %%
-def process_chat_file(file, do_spelling_correction, whatsapp_name, datetime_dayfirst, message_line_format, do_reordering=False):
+def process_chat_file(
+    file,
+    do_spelling_correction,
+    whatsapp_name,
+    datetime_dayfirst,
+    message_line_format,
+    minutes_threshold,
+    min_messages_per_conversation,
+    do_reordering=False,
+):
     """
     Process a chat file and return a dataset with the conversations.
     """
@@ -224,50 +236,83 @@ def process_chat_file(file, do_spelling_correction, whatsapp_name, datetime_dayf
         message_line_format
     )
 
-    def process_line(example):
+    def process_line(examples):
         # The lines have this format: dd/mm/yy, hh:mm - <person>: <msg>
-        try:
-            groups = exp.match(example["text"]).groupdict()
-            timestamp = dateutil.parser.parse(groups['msg_datetime'], dayfirst=datetime_dayfirst).timestamp()
-            return {
-                "message": groups["message"],
-                "contact_name": groups["contact_name"],
-                "timestamp": timestamp,
-            }
-        except Exception as e:
-            logger.exception(example["text"])
-            raise e
+        messages = []
+        contact_names = []
+        timestamps = []
+        for line_text in examples["text"]:
+            try:
+                groups = exp.match(line_text).groupdict()
+                # First, get the elements. If something fails here, it will raise an exception before actually adding the element to the list, so we'll be sure that the three lists contain the same # of elements.
+                timestamp = dateutil.parser.parse(
+                    groups["msg_datetime"], dayfirst=datetime_dayfirst
+                ).timestamp()
+                message = groups["message"]
+                contact_name = groups["contact_name"]
+                messages.append(message)
+                contact_names.append(contact_name)
+                timestamps.append(timestamp)
+            except Exception as e:
+                logger.exception(f"Error while processing line {line_text}")
+        return {
+            "message": messages,
+            "contact_name": contact_names,
+            "timestamp": timestamps,
+        }
 
     try:
         ds = datasets.load_dataset("text", data_files=[file])["train"]
     except Exception as e:
         logger.exception(f"Error while loading file {file}")
         raise Exception(f"Error while loading file {file}") from e
+
+    # try:
+    #     ds = ds.filter(
+    #         # Has to begin by date, time, contact name, and contain at least a ':' symbol
+    #         lambda x: re.match(
+    #             r"^\d{1,2}/\d{1,2}/\d{1,4},\s\d{2}:\d{2}\s-\s.+:", x["text"]
+    #         )
+    #     )
+    # except Exception as e:
+    #     logger.exception(f"Error filtering the lines in file {file} so they match the expected format")
+    #     raise Exception(f"Error filtering the lines in file {file} so they match the expected format") from e
+
     try:
-        ds = ds.filter(
-            # Has to begin by date, time, contact name, and contain at least a ':' symbol
-            lambda x: re.match(
-                r"^\d{1,2}/\d{1,2}/\d{1,2},\s\d{2}:\d{2}\s-\s.+:", x["text"]
-            )
-        )
-    except Exception as e:
-        logger.exception(f"Error filtering the lines in file {file} so they match the expected format")
-        raise Exception(f"Error filtering the lines in file {file} so they match the expected format") from e
-    try:
-        ds = ds.map(process_line, remove_columns=["text"])
+        ds = ds.map(process_line, remove_columns=["text"], batched=True, batch_size=10)
     except Exception as e:
-        logger.exception(f"Error mapping the lines in file {file} to the expected format")
-        raise Exception(f"Error mapping the lines in file {file} to the expected format") from e
+        logger.exception(
+            f"Error mapping the lines in file {file} to the expected format"
+        )
+        raise Exception(
+            f"Error mapping the lines in file {file} to the expected format"
+        ) from e
+
+    # Check that the WhatsApp name is in at least one of the messages. If it's not, raise an exception
+    set_of_contact_names = ds.unique("contact_name")
+    if whatsapp_name not in set_of_contact_names:
+        raise Exception(
+            f"Your WhatsApp name ({whatsapp_name}) is not in the messages of at least one uploaded file. Please check that you wrote your name correctly. These were the participants found: {set_of_contact_names}"
+        )
+    # # Also check that the number of contact names is == 2 (i.e. we don't have group chats)
+    # if len(set_of_contact_names) > 2:
+    #     raise Exception(
+    #         f"There were more than 2 participants in at least one uploaded file. Please check that you're not using group chats. These were the participants found: {set_of_contact_names}"
+    #     )
 
     try:
         # Filter out messages that just say '<Media omitted>'
         ds = ds.filter(lambda x: x["message"] != "<Media omitted>")
     except Exception as e:
-        logger.exception(f"Error filtering out messages that say '<Media omitted>' in file {file}")
-        raise Exception(f"Error filtering out messages that say '<Media omitted>' in file {file}") from e
+        logger.exception(
+            f"Error filtering out messages that say '<Media omitted>' in file {file}"
+        )
+        raise Exception(
+            f"Error filtering out messages that say '<Media omitted>' in file {file}"
+        ) from e
 
     try:
-        groups = group_messages(iter(ds))
+        groups = group_messages(iter(ds), minutes_threshold=minutes_threshold)
         # Generate the dataset
         conversations_ds = datasets.Dataset.from_dict({"conversations": groups})
     except Exception as e:
@@ -277,11 +322,15 @@ def process_chat_file(file, do_spelling_correction, whatsapp_name, datetime_dayf
     try:
         # Filter out conversations with less than 5 messages
         conversations_ds = conversations_ds.filter(
-            lambda x: len(x["conversations"]) >= MIN_MESSAGES_THRESHOLD
+            lambda x: len(x["conversations"]) >= min_messages_per_conversation
         )
     except Exception as e:
-        logger.exception(f"Error filtering out conversations with less than {MIN_MESSAGES_THRESHOLD} messages in file {file}")
-        raise Exception(f"Error filtering out conversations with less than {MIN_MESSAGES_THRESHOLD} messages in file {file}") from e
+        logger.exception(
+            f"Error filtering out conversations with less than {min_messages_per_conversation} messages in file {file}"
+        )
+        raise Exception(
+            f"Error filtering out conversations with less than {min_messages_per_conversation} messages in file {file}"
+        ) from e
 
     try:
         conversations_ds_without_whatsapp_annotations = conversations_ds.map(
@@ -295,11 +344,15 @@ def process_chat_file(file, do_spelling_correction, whatsapp_name, datetime_dayf
     if do_spelling_correction:
         try:
             spell_checked_conversations_ds = (
-                conversations_ds_without_whatsapp_annotations.map(spell_check_conversation)
+                conversations_ds_without_whatsapp_annotations.map(
+                    spell_check_conversation
+                )
             )
         except Exception as e:
             logger.exception(f"Error spell checking the conversations in file {file}")
-            raise Exception(f"Error spell checking the conversations in file {file}") from e
+            raise Exception(
+                f"Error spell checking the conversations in file {file}"
+            ) from e
     else:
         spell_checked_conversations_ds = conversations_ds_without_whatsapp_annotations
 
@@ -327,7 +380,9 @@ def process_chat_file(file, do_spelling_correction, whatsapp_name, datetime_dayf
         )  # , num_proc=os.cpu_count() - 1)
     except Exception as e:
         logger.exception(f"Error changing your other contact's names in file {file}")
-        raise Exception(f"Error changing your other contact's names in file {file}") from e
+        raise Exception(
+            f"Error changing your other contact's names in file {file}"
+        ) from e
 
     try:
         # Filter out conversations with only one contact
@@ -335,18 +390,26 @@ def process_chat_file(file, do_spelling_correction, whatsapp_name, datetime_dayf
             lambda x: len(set([msg["contact_name"] for msg in x["conversations"]])) > 1
         )
     except Exception as e:
-        logger.exception(f"Error filtering out conversations with only one contact in file {file}")
-        raise Exception(f"Error filtering out conversations with only one contact in file {file}") from e
+        logger.exception(
+            f"Error filtering out conversations with only one contact in file {file}"
+        )
+        raise Exception(
+            f"Error filtering out conversations with only one contact in file {file}"
+        ) from e
 
     return changed_contact_name_ds
 
 
-SPLIT_CONVERSATION_THRESHOLD = 40
-MAX_CHARACTERS_PER_MESSAGE = 10000  # Max is 8,192 tokens (https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini-supervised-tuning-about#sample-datasets)
-
-
 def transform_conversations_dataset_into_training_examples(
-    conversations_ds, system_prompt, user_role, model_role, whatsapp_name
+    conversations_ds,
+    system_prompt,
+    user_role,
+    model_role,
+    whatsapp_name,
+    minutes_threshold,
+    min_messages_per_conversation,
+    split_conversation_threshold,
+    max_characters_per_message,
 ):
     """
     Takes in a dataset with conversations and returns a dataset with training examples.
@@ -376,7 +439,7 @@ def transform_conversations_dataset_into_training_examples(
                 model_role if msg["contact_name"] == whatsapp_name else user_role
             )
             if (
-                counter > SPLIT_CONVERSATION_THRESHOLD
+                counter > split_conversation_threshold
                 and converted_role == user_role
             ):
                 processed_examples.append(
@@ -401,7 +464,7 @@ def transform_conversations_dataset_into_training_examples(
                 {"role": converted_role, "content": [msg["message"]]}
             )
             counter += 1
-        if len(messages) >= MIN_MESSAGES_THRESHOLD:
+        if len(messages) >= min_messages_per_conversation:
             processed_examples.append(
                 {
                     "messages": [
@@ -415,8 +478,13 @@ def transform_conversations_dataset_into_training_examples(
             )
         else:
            logger.warning(
-                f"Discarding conversation because the length is not at least {MIN_MESSAGES_THRESHOLD}: {messages}"
+                f"Discarding conversation because the length is not at least {min_messages_per_conversation}: {messages}"
            )
+    if len(processed_examples) == 0:
+        logger.warning(
+            f"Discarding all conversations because none of them have at least {min_messages_per_conversation} messages"
+        )
+        return {}
     # Before returning, flatten the list of dictionaries into a dictionary of lists
     flattened_examples = {}
     for key in processed_examples[0].keys():
@@ -431,17 +499,25 @@ def transform_conversations_dataset_into_training_examples(
             batched=True,
         )
     except Exception as e:
-        logger.exception("Error transforming the conversations dataset into training examples")
-        raise Exception("Error transforming the conversations dataset into training examples") from e
+        logger.exception(
+            "Error transforming the conversations dataset into training examples"
+        )
+        raise Exception(
+            "Error transforming the conversations dataset into training examples"
+        ) from e
 
     try:
         examples_filtered_by_length = processed_examples.filter(
             lambda x: all(
-                [len(m["content"]) < MAX_CHARACTERS_PER_MESSAGE for m in x["messages"]]
+                [len(m["content"]) < max_characters_per_message for m in x["messages"]]
            )
        )
     except Exception as e:
-        logger.exception("Error filtering out examples with messages longer than the maximum allowed")
-        raise Exception("Error filtering out examples with messages longer than the maximum allowed") from e
+        logger.exception(
+            "Error filtering out examples with messages longer than the maximum allowed"
+        )
+        raise Exception(
+            "Error filtering out examples with messages longer than the maximum allowed"
+        ) from e
 
     return examples_filtered_by_length
9
  logger = logging.getLogger(__name__)
10
  logger.setLevel(logging.INFO)
11
 
 
 
 
 
 
 
 
12
 
13
+ # %%
14
+ def group_messages(messages_iterable, minutes_threshold):
15
+ """
16
+ Groups messages in a conversation. If the difference between two consecutive messages is less than `minutes_threshold` minutes, they are grouped together.
17
+ """
18
  groups = []
19
+ current_group = []
20
+ try:
21
+ first_message = next(messages_iterable)
22
+ current_group.append(first_message)
23
+ except StopIteration:
24
+ logger.exception("No messages in the conversation")
25
+ return []
26
  for message in messages_iterable:
27
  assert len(current_group) > 0 # We should never have an empty group
28
  if (
29
  message["timestamp"] - current_group[-1]["timestamp"]
30
+ < minutes_threshold * 60
31
  ):
32
  current_group.append(message)
33
  else:
 
217
 
218
 
219
  # %%
220
+ def process_chat_file(
221
+ file,
222
+ do_spelling_correction,
223
+ whatsapp_name,
224
+ datetime_dayfirst,
225
+ message_line_format,
226
+ minutes_threshold,
227
+ min_messages_per_conversation,
228
+ do_reordering=False,
229
+ ):
230
  """
231
  Process a chat file and return a dataset with the conversations.
232
  """
 
236
  message_line_format
237
  )
238
 
239
+ def process_line(examples):
240
  # The lines have this format: dd/mm/yy, hh:mm - <person>: <msg>
241
+ messages = []
242
+ contact_names = []
243
+ timestamps = []
244
+ for line_text in examples["text"]:
245
+ try:
246
+ groups = exp.match(line_text).groupdict()
247
+ # First, get the elements. If something fails here, it will raise an exception before actually adding the element to the list, so we'll be sure that the three lists contain the same # of elements.
248
+ timestamp = dateutil.parser.parse(
249
+ groups["msg_datetime"], dayfirst=datetime_dayfirst
250
+ ).timestamp()
251
+ message = groups["message"]
252
+ contact_name = groups["contact_name"]
253
+ messages.append(message)
254
+ contact_names.append(contact_name)
255
+ timestamps.append(timestamp)
256
+ except Exception as e:
257
+ logger.exception(f"Error while processing line {line_text}")
258
+ return {
259
+ "message": messages,
260
+ "contact_name": contact_names,
261
+ "timestamp": timestamps,
262
+ }
263
 
264
  try:
265
  ds = datasets.load_dataset("text", data_files=[file])["train"]
266
  except Exception as e:
267
  logger.exception(f"Error while loading file {file}")
268
  raise Exception(f"Error while loading file {file}") from e
269
+
270
+ # try:
271
+ # ds = ds.filter(
272
+ # # Has to begin by date, time, contact name, and contain at least a ':' symbol
273
+ # lambda x: re.match(
274
+ # r"^\d{1,2}/\d{1,2}/\d{1,4},\s\d{2}:\d{2}\s-\s.+:", x["text"]
275
+ # )
276
+ # )
277
+ # except Exception as e:
278
+ # logger.exception(f"Error filtering the lines in file {file} so they match the expected format")
279
+ # raise Exception(f"Error filtering the lines in file {file} so they match the expected format") from e
280
+
281
  try:
282
+ ds = ds.map(process_line, remove_columns=["text"], batched=True, batch_size=10)
 
 
 
 
 
 
 
 
 
 
283
  except Exception as e:
284
+ logger.exception(
285
+ f"Error mapping the lines in file {file} to the expected format"
286
+ )
287
+ raise Exception(
288
+ f"Error mapping the lines in file {file} to the expected format"
289
+ ) from e
290
+
291
+ # Check that the WhatsApp name is in at least one of the messages. If it's not, raise an exception
292
+ set_of_contact_names = ds.unique("contact_name")
293
+ if whatsapp_name not in set_of_contact_names:
294
+ raise Exception(
295
+ f"Your WhatsApp name ({whatsapp_name}) is not in the messages of at least one uploaded file. Please check that you wrote your name correctly. These were the participants found: {set_of_contact_names}"
296
+ )
297
+ # # Also check that the number of contact names is == 2 (i.e. we don't have group chats)
298
+ # if len(set_of_contact_names) > 2:
299
+ # raise Exception(
300
+ # f"There were more than 2 participants in at least one uploaded file. Please check that you're not using group chats. These were the participants found: {set_of_contact_names}"
301
+ # )
302
 
303
  try:
304
  # Filter out messages that just say '<Media omitted>'
305
  ds = ds.filter(lambda x: x["message"] != "<Media omitted>")
306
  except Exception as e:
307
+ logger.exception(
308
+ f"Error filtering out messages that say '<Media omitted>' in file {file}"
309
+ )
310
+ raise Exception(
311
+ f"Error filtering out messages that say '<Media omitted>' in file {file}"
312
+ ) from e
313
 
314
  try:
315
+ groups = group_messages(iter(ds), minutes_threshold=minutes_threshold)
316
  # Generate the dataset
317
  conversations_ds = datasets.Dataset.from_dict({"conversations": groups})
318
  except Exception as e:
 
322
  try:
323
  # Filter out conversations with less than 5 messages
324
  conversations_ds = conversations_ds.filter(
325
+ lambda x: len(x["conversations"]) >= min_messages_per_conversation
326
  )
327
  except Exception as e:
328
+ logger.exception(
329
+ f"Error filtering out conversations with less than {min_messages_per_conversation} messages in file {file}"
330
+ )
331
+ raise Exception(
332
+ f"Error filtering out conversations with less than {min_messages_per_conversation} messages in file {file}"
333
+ ) from e
334
 
335
  try:
336
  conversations_ds_without_whatsapp_annotations = conversations_ds.map(
 
344
  if do_spelling_correction:
345
  try:
346
  spell_checked_conversations_ds = (
347
+ conversations_ds_without_whatsapp_annotations.map(
348
+ spell_check_conversation
349
+ )
350
  )
351
  except Exception as e:
352
  logger.exception(f"Error spell checking the conversations in file {file}")
353
+ raise Exception(
354
+ f"Error spell checking the conversations in file {file}"
355
+ ) from e
356
  else:
357
  spell_checked_conversations_ds = conversations_ds_without_whatsapp_annotations
358
 
 
380
  ) # , num_proc=os.cpu_count() - 1)
381
  except Exception as e:
382
  logger.exception(f"Error changing your other contact's names in file {file}")
383
+ raise Exception(
384
+ f"Error changing your other contact's names in file {file}"
385
+ ) from e
386
 
387
  try:
388
  # Filter out conversations with only one contact
 
390
  lambda x: len(set([msg["contact_name"] for msg in x["conversations"]])) > 1
391
  )
392
  except Exception as e:
393
+ logger.exception(
394
+ f"Error filtering out conversations with only one contact in file {file}"
395
+ )
396
+ raise Exception(
397
+ f"Error filtering out conversations with only one contact in file {file}"
398
+ ) from e
399
 
400
  return changed_contact_name_ds
401
 
402
 
 
 
 
 
403
  def transform_conversations_dataset_into_training_examples(
404
+ conversations_ds,
405
+ system_prompt,
406
+ user_role,
407
+ model_role,
408
+ whatsapp_name,
409
+ minutes_threshold,
410
+ min_messages_per_conversation,
411
+ split_conversation_threshold,
412
+ max_characters_per_message,
413
  ):
414
  """
415
  Takes in a dataset with conversations and returns a dataset with training examples.
 
439
  model_role if msg["contact_name"] == whatsapp_name else user_role
440
  )
441
  if (
442
+ counter > split_conversation_threshold
443
  and converted_role == user_role
444
  ):
445
  processed_examples.append(
 
464
  {"role": converted_role, "content": [msg["message"]]}
465
  )
466
  counter += 1
467
+ if len(messages) >= min_messages_per_conversation:
468
  processed_examples.append(
469
  {
470
  "messages": [
 
478
  )
479
  else:
480
  logger.warning(
481
+ f"Discarding conversation because the length is not at least {min_messages_per_conversation}: {messages}"
482
  )
483
+ if len(processed_examples) == 0:
484
+ logger.warning(
485
+ f"Discarding all conversations because none of them have at least {min_messages_per_conversation} messages"
486
+ )
487
+ return {}
488
  # Before returning, flatten the list of dictionaries into a dictionary of lists
489
  flattened_examples = {}
490
  for key in processed_examples[0].keys():
 
499
  batched=True,
500
  )
501
  except Exception as e:
502
+ logger.exception(
503
+ "Error transforming the conversations dataset into training examples"
504
+ )
505
+ raise Exception(
506
+ "Error transforming the conversations dataset into training examples"
507
+ ) from e
508
 
509
  try:
510
  examples_filtered_by_length = processed_examples.filter(
511
  lambda x: all(
512
+ [len(m["content"]) < max_characters_per_message for m in x["messages"]]
513
  )
514
  )
515
  except Exception as e:
516
+ logger.exception(
517
+ "Error filtering out examples with messages longer than the maximum allowed"
518
+ )
519
+ raise Exception(
520
+ "Error filtering out examples with messages longer than the maximum allowed"
521
+ ) from e
522
 
523
  return examples_filtered_by_length
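
For reference, the grouping rule that `minutes_threshold` now parameterizes: consecutive messages closer together than the threshold belong to the same conversation, and a larger gap starts a new one. A minimal standalone sketch (not part of the commit; the final flush of the last group and the toy timestamps are assumptions, since the hunk does not show the function's tail):

def group_messages(messages_iterable, minutes_threshold):
    groups = []
    try:
        current_group = [next(messages_iterable)]
    except StopIteration:
        return []  # no messages at all
    for message in messages_iterable:
        if message["timestamp"] - current_group[-1]["timestamp"] < minutes_threshold * 60:
            current_group.append(message)  # small gap: same conversation
        else:
            groups.append(current_group)  # big gap: start a new conversation
            current_group = [message]
    groups.append(current_group)  # assumed: flush the final group
    return groups

msgs = [
    {"timestamp": 0},
    {"timestamp": 60},           # 1 minute later: same conversation
    {"timestamp": 4 * 60 * 60},  # 4 hours later: new conversation
]
print([len(g) for g in group_messages(iter(msgs), minutes_threshold=180)])  # [2, 1]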