t5-data-reasoning / README.md
suriya7's picture
Update README.md
1d80ddf verified
|
raw
history blame
3.32 kB
metadata
library_name: transformers
tags: []

import random
import json

def generate_random_data():
    """Build one randomly populated metrics record.

    The record has flat integer counters (users, groups, scans, ...), a
    nested ``"Confirmed_Vulnerabilities"`` mapping of per-category counts and
    a nested ``"Trend_Percentages"`` mapping of percentage changes rounded to
    two decimals.

    Returns:
        dict: A freshly generated record; every call draws new values from
        the module-level ``random`` generator.
    """

    def trend(low, high):
        # Percentage change drawn uniformly from [low, high], 2 decimals.
        return round(random.uniform(low, high), 2)

    return {
        "Users": random.randint(5, 20),
        "Groups": random.randint(10, 30),
        "Projects/Repositories": random.randint(4000, 5000),
        "Scans": random.randint(40, 100),
        "Lines_of_Code": random.randint(25000000, 35000000),
        "Vulnerabilities": random.randint(7000, 8000),
        "False_Positives": random.randint(10, 30),
        "True_Positives": random.randint(150, 200),
        "Confirmed_Vulnerabilities": {
            "Secret": random.randint(0, 200),
            "PII": random.randint(0, 200),
            "SAST": random.randint(0, 200),
            "SCA": random.randint(0, 200),
            "IaC": random.randint(0, 200),
            "Container": random.randint(0, 200),
            "API": random.randint(0, 200),
            "Compliance": random.randint(0, 200),
            "Malware": random.randint(0, 225),
        },
        "Trend_Percentages": {
            "Scans": trend(-100, 100),
            # These two used the degenerate range (-100, -100), which made
            # the trend a constant -100.0; they now span the same range as
            # the other count-based trends.
            "Lines_of_Code": trend(-100, 100),
            "Vulnerabilities": trend(-100, 100),
            "False_Positives": trend(-100, 1000),
            "True_Positives": trend(-100, 100),
            # NOTE(review): "Container" and "API" have confirmed counts above
            # but no trend entry here - confirm whether that is intentional.
            "Secret": trend(-100, 1500),
            "PII": trend(-100, 1500),
            "SAST": trend(-100, 1500),
            "SCA": trend(-100, 1500),
            "IaC": trend(-100, 1500),
            "Compliance": trend(-100, 1500),
            "Malware": trend(-100, 1500),
        },
    }



def json_to_semi_structured_text(data):
    """Convert a JSON-like record into semi-structured text for T5-Flan.

    Top-level scalar entries become ``"Key name: value"`` lines (underscores
    replaced by spaces, first letter capitalised). Nested dictionaries become
    a ``"Key:"`` header line followed by one ``"- sub_key: sub_value"`` line
    per entry.

    Args:
        data (dict | str): The record itself, a JSON document, or the
            ``str()`` of a plain dict (single-quoted keys/values).

    Returns:
        str: Semi-structured text representation, lines joined by newlines.

    Raises:
        json.JSONDecodeError: If ``data`` is a string that cannot be parsed.
    """
    import ast  # local import so this function does not depend on file-level imports

    if isinstance(data, str):
        try:
            # Real JSON first: keeps apostrophes inside values intact.
            data = json.loads(data)
        except json.JSONDecodeError:
            try:
                # str(dict) output is a Python literal, not JSON.
                data = ast.literal_eval(data)
            except (ValueError, SyntaxError, TypeError):
                # Legacy behaviour: naive quote swap for single-quoted
                # pseudo-JSON (e.g. "{'flag': true}").
                data = json.loads(data.replace("'", '"'))

    text_output = []

    # The exact line format is kept unchanged: this text is what gets fed to
    # the model as its prompt.
    for key, value in data.items():
        label = str(key)
        if isinstance(value, dict):
            # Handle nested dictionaries
            text_output.append(f"{label.capitalize()}:")
            for sub_key, sub_value in value.items():
                text_output.append(f"- {sub_key}: {sub_value}")
        else:
            # Direct key-value pairs
            text_output.append(f"{label.replace('_', ' ').capitalize()}: {value}")

    return "\n".join(text_output)

## Inference

# Load model directly
import time

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("suriya7/t5-data-reasoning")
model = AutoModelForSeq2SeqLM.from_pretrained("suriya7/t5-data-reasoning")

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build a random record and render it in the text format the model takes.
data_inp = json_to_semi_structured_text(str(generate_random_data()))

inp = "Summarize and reason: " + data_inp

start = time.time()

inputs = tokenizer(inp, return_tensors="pt", truncation=True)
model.to(device)
inputs = inputs.to(device)
# Greedy decoding (do_sample=False), capped at 256 tokens.
outputs = model.generate(**inputs, max_length=256, do_sample=False)
# Drop special tokens such as <pad> and </s> from the printed answer.
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(answer)

end = time.time()
print(f"Time taken: {end - start}")
print('\n\n')
print("input: "+inp)