|
--- |
|
library_name: transformers |
|
tags: [] |
|
--- |
|
|
|
```python |
|
|
|
import random |
|
import json |
|
|
|
def generate_random_data():
    """Build one randomly populated security-dashboard snapshot.

    Returns:
        dict: Top-level integer counters, a nested
        ``Confirmed_Vulnerabilities`` dict of per-category integer counts,
        and a nested ``Trend_Percentages`` dict of floats rounded to two
        decimal places.
    """
    return {
        "Users": random.randint(5, 20),
        "Groups": random.randint(10, 30),
        "Projects/Repositories": random.randint(4000, 5000),
        "Scans": random.randint(40, 100),
        "Lines_of_Code": random.randint(25000000, 35000000),
        "Vulnerabilities": random.randint(7000, 8000),
        "False_Positives": random.randint(10, 30),
        "True_Positives": random.randint(150, 200),
        "Confirmed_Vulnerabilities": {
            "Secret": random.randint(0, 200),
            "PII": random.randint(0, 200),
            "SAST": random.randint(0, 200),
            "SCA": random.randint(0, 200),
            "IaC": random.randint(0, 200),
            "Container": random.randint(0, 200),
            "API": random.randint(0, 200),
            "Compliance": random.randint(0, 200),
            "Malware": random.randint(0, 225),
        },
        "Trend_Percentages": {
            "Scans": round(random.uniform(-100, 100), 2),
            # These two previously used uniform(-100, -100), a degenerate
            # range that always produced -100.0; they now share the
            # symmetric range used by the other top-level trends.
            "Lines_of_Code": round(random.uniform(-100, 100), 2),
            "Vulnerabilities": round(random.uniform(-100, 100), 2),
            "False_Positives": round(random.uniform(-100, 1000), 2),
            "True_Positives": round(random.uniform(-100, 100), 2),
            # NOTE(review): "Container" and "API" have confirmed counts above
            # but no trend entry here -- confirm whether that is intentional.
            "Secret": round(random.uniform(-100, 1500), 2),
            "PII": round(random.uniform(-100, 1500), 2),
            "SAST": round(random.uniform(-100, 1500), 2),
            "SCA": round(random.uniform(-100, 1500), 2),
            "IaC": round(random.uniform(-100, 1500), 2),
            "Compliance": round(random.uniform(-100, 1500), 2),
            "Malware": round(random.uniform(-100, 1500), 2),
        },
    }
|
|
|
|
|
|
|
def json_to_semi_structured_text(data):
    """Convert JSON data into a semi-structured text format for training T5-Flan.

    Args:
        data (dict | str): Either the already-parsed mapping, or a string
            holding JSON or the ``str()`` of a plain dict (single-quoted
            keys and values).

    Returns:
        str: One line per top-level entry, joined with newlines. Scalar
        entries render as ``"<Key with spaces>: <value>"``; nested dicts
        render as a ``"<Key>:"`` header followed by ``"- <sub_key>: <value>"``
        lines.

    Raises:
        json.JSONDecodeError: If ``data`` is a string that cannot be parsed
            even after the single-quote fallback.
    """
    if isinstance(data, str):
        try:
            # Prefer real JSON so apostrophes inside values are not mangled.
            data = json.loads(data)
        except json.JSONDecodeError:
            # str(dict) yields a single-quoted Python repr; swapping the
            # quotes makes simple reprs (no embedded quotes) valid JSON.
            data = json.loads(data.replace("'", '"'))

    text_output = []
    for key, value in data.items():
        if isinstance(value, dict):
            # Nested dictionaries: header keeps its underscores (only the
            # flat branch below replaces them); output format is left as-is.
            text_output.append(f"{key.capitalize()}:")
            text_output.extend(
                f"- {sub_key}: {sub_value}" for sub_key, sub_value in value.items()
            )
        else:
            # Direct key-value pairs
            text_output.append(f"{key.replace('_', ' ').capitalize()}: {value}")

    return "\n".join(text_output)
|
|
|
``` |
|
|
|
### Inference |
|
|
|
|
|
|
|
```python |
|
# Load model directly
import time

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("suriya7/t5-data-reasoning")
model = AutoModelForSeq2SeqLM.from_pretrained("suriya7/t5-data-reasoning")

# `device` was referenced but never defined; use the GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model once, before timing, so the measurement covers inference only.
model.to(device)

# Build a random dashboard snapshot and render it in the model's input format.
data_inp = json_to_semi_structured_text(str(generate_random_data()))
inp = "Summarize and reason: " + data_inp

start = time.time()

inputs = tokenizer(inp, return_tensors="pt", truncation=True).to(device)
outputs = model.generate(**inputs, max_length=256, do_sample=False)
# Drop <pad> / </s> markers so only the generated text is printed.
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(answer)

end = time.time()
print(f"Time taken: {end - start}")
print('\n\n')
print("input: " + inp)
|
``` |