isaiahbjork committed c945c04 (parent: 81f8b0a): Update README.md

README.md CHANGED

@@ -16,6 +16,11 @@

# Llama 3.1 8B Logic

Prompt the model to "use COT" and it will think things out logically.

Basic Compound Words Evaluation (using the script below):

- Accuracy: 86.00%
- Correct predictions: 129
- Total predictions: 150
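
For example, the "use COT" cue can go in the instruction field of the Alpaca template used by the evaluation script further down. A minimal sketch (the question string is a made-up example in the style of that script):

```python
# Minimal sketch: same Alpaca template as the evaluation script below;
# the question is a made-up example.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{0}

### Input:
{1}

### Response:
"""

prompt = alpaca_prompt.format(
    "You are an expert at logic puzzles, reasoning, and planning. Use COT.",
    "How many r's in Strawberry?",
)
print(prompt)
```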

## Example (Trained)

### Instruction:

@@ -150,6 +155,96 @@

```python
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=256)
```

# Evaluation - Google Colab

```python
import re
import random
from transformers import TextStreamer

# Parse the model output and extract the predicted count.
# Matches phrasings like: The letter "r" appears 3 ...
def extract_count(output):
    match = re.search(r'The letter "[a-z]" (?:appears|occurs|is found|exists) (\d+)', output)
    if match:
        return int(match.group(1))
    return None

# Generate (word, letter, actual_count) test triples from a fixed word list
def generate_test_data(num_words=150):
    words = ["Airplane", "Airport", "Angelfish", "Antfarm", "Ballpark", "Beachball", "Bikerack", "Billboard", "Blackhole", "Blueberry", "Boardwalk", "Bodyguard", "Bookstore", "Bow Tie", "Brainstorm", "Busboy", "Cabdriver", "Candlestick", "Car wash", "Cartwheel", "Catfish", "Caveman", "Chocolate chip", "Crossbow", "Daydream", "Deadend", "Doghouse", "Dragonfly", "Dress shoes", "Dropdown", "Earlobe", "Earthquake", "Eyeballs", "Father-in-law", "Fingernail", "Firecracker", "Firefighter", "Firefly", "Firework", "Fishbowl", "Fisherman", "Fishhook", "Football", "Forget", "Forgive", "French fries", "Goodnight", "Grandchild", "Groundhog", "Hairband", "Hamburger", "Handcuff", "Handout", "Handshake", "Headband", "Herself", "High heels", "Honeydew", "Hopscotch", "Horseman", "Horseplay", "Hotdog", "Ice cream", "Itself", "Kickball", "Kickboxing", "Laptop", "Lifetime", "Lighthouse", "Mailman", "Midnight", "Milkshake", "Moonrocks", "Moonwalk", "Mother-in-law", "Movie theater", "Newborn", "Newsletter", "Newspaper", "Nightlight", "Nobody", "Northpole", "Nosebleed", "Outer space", "Over-the-counter", "Overestimate", "Paycheck", "Policeman", "Ponytail", "Post card", "Racquetball", "Railroad", "Rainbow", "Raincoat", "Raindrop", "Rattlesnake", "Rockband", "Rocketship", "Rowboat", "Sailboat", "Schoolbooks", "Schoolwork", "Shoelace", "Showoff", "Skateboard", "Snowball", "Snowflake", "Softball", "Solar system", "Soundproof", "Spaceship", "Spearmint", "Starfish", "Starlight", "Stingray", "Strawberry", "Subway", "Sunglasses", "Sunroof", "Supercharge", "Superman", "Superstar", "Tablespoon", "Tailbone", "Tailgate", "Take down", "Takeout", "Taxpayer", "Teacup", "Teammate", "Teaspoon", "Tennis shoes", "Throwback", "Timekeeper", "Timeline", "Timeshare", "Tugboat", "Tupperware", "Underestimate", "Uplift", "Upperclassman", "Uptown", "Video game", "Wallflower", "Waterboy", "Watermelon", "Wheelchair", "Without", "Workboots", "Worksheet"]
    letters = "aeioulprts"
    test_data = []
    for word in words[:num_words]:
        letter = random.choice(letters)
        actual_count = word.lower().count(letter)  # use lower() to count case-insensitively
        test_data.append((word, letter, actual_count))
    return test_data

# Alpaca prompt template
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{0}

### Input:
{1}

### Response:
"""

# Generate test data
test_data = generate_test_data()

# Run evaluation
correct_predictions = 0
total_predictions = 0

for word, letter, actual_count in test_data:
    input_text = f"How many {letter}'s in {word}?"
    prompt = alpaca_prompt.format(
        "You are an expert at logic puzzles, reasoning, and planning",
        input_text,
    )

    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    text_streamer = TextStreamer(tokenizer)
    output = model.generate(**inputs, streamer=text_streamer, max_new_tokens=256)

    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
    predicted_count = extract_count(decoded_output)

    total_predictions += 1

    if predicted_count is not None:
        if predicted_count == actual_count:
            correct_predictions += 1
    else:
        # If no count could be extracted and the actual count is 0, treat it as correct
        if actual_count == 0:
            correct_predictions += 1
        print(f"Warning: Could not extract a count from the model's response for '{word}'.")

    print(f"Word: {word}, Letter: {letter}")
    print(f"Actual count: {actual_count}, Predicted count: {predicted_count}")
    print("Correct" if (predicted_count == actual_count or (predicted_count is None and actual_count == 0)) else "Incorrect")

    # Running accuracy after each word
    current_accuracy = correct_predictions / total_predictions
    print(f"Current Accuracy: {current_accuracy:.2%}")
    print(f"Correct predictions: {correct_predictions}")
    print(f"Total predictions: {total_predictions}")
    print("---")

# Final accuracy
accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
print(f"\nAccuracy: {accuracy:.2%}")
print(f"Correct predictions: {correct_predictions}")
print(f"Total predictions: {total_predictions}")
```
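
The loop above assumes `model` and `tokenizer` are already in scope from the usage section earlier in this card. If you run the evaluation standalone, a minimal setup might look like the sketch below; the checkpoint name is the base model from the footer (a placeholder for this fine-tune's repo id), and `max_seq_length` is an assumption:

```python
# Sketch only: load a 4-bit model with Unsloth before running the evaluation.
# Swap model_name for this fine-tune's checkpoint if you have its repo id.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",  # placeholder base model
    max_seq_length=2048,  # assumption
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # enable Unsloth's fast inference path
```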

- **Developed by:** isaiahbjork
- **License:** apache-2.0
- **Finetuned from model:** unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit