Spaces:
Running
Running
""" | |
Dataset creation tools. | |
Keep to-level imports clean of non-trivial imports for specific tools, | |
because this file is imported for various purposes | |
""" | |
import ast | |
import concurrent.futures | |
import contextlib | |
import hashlib | |
import json | |
import os | |
import shutil | |
import signal | |
import sys | |
import traceback | |
from concurrent.futures import ProcessPoolExecutor | |
import psutil | |
import pytest | |
import pandas as pd | |
import numpy as np | |
from tqdm import tqdm | |
from utils import flatten_list, remove | |
def parse_rst_file(filepath): | |
with open(filepath, 'r') as f: | |
input_data = f.read() | |
settings_overrides = {'initial_header_level': 2} | |
from docutils import core | |
document = core.publish_doctree( | |
source=input_data, | |
source_path=filepath, | |
settings_overrides=settings_overrides, | |
) | |
qa_pairs = [] | |
current_section = None | |
current_question = "" | |
current_answer = "" | |
for node in document.traverse(): | |
if node.__class__.__name__ == 'section': | |
current_section = "" | |
elif current_section is not None: | |
if node.__class__.__name__ == 'Text': | |
if node.astext()[-1] == "?": | |
if current_question: | |
qa_pairs.append((current_question, current_answer)) | |
current_question = node.astext() | |
current_answer = "" | |
else: | |
current_answer += node.astext() | |
if current_answer: | |
qa_pairs.append((current_question, current_answer)) | |
return {k: v for k, v in qa_pairs} | |
def test_scrape_dai_docs(): | |
home = os.path.expanduser('~') | |
file = os.path.join(home, 'h2oai/docs/faq.rst') | |
qa_pairs = parse_rst_file(file) | |
prompt_type = 'human_bot' | |
from prompter import prompt_types | |
assert prompt_type in prompt_types | |
save_thing = [{"instruction": k, "output": v, 'prompt_type': prompt_type} for k, v in qa_pairs.items()] | |
output_file = "dai_faq.json" | |
with open(output_file, "wt") as f: | |
f.write(json.dumps(save_thing, indent=2)) | |
def test_scrape_dai_docs_all(): | |
""" | |
pytest create_data.py::test_scrape_dai_docs_all | |
""" | |
import glob | |
import nltk | |
nltk.download('punkt') | |
dd = {} | |
np.random.seed(1234) | |
home = os.path.expanduser('~') | |
files = list(glob.glob(os.path.join(home, "h2oai/docs/**/*rst"))) | |
np.random.shuffle(files) | |
val_count = int(0.05 * len(files)) | |
train_files = files[val_count:] | |
valid_files = files[:val_count] | |
things = [ | |
("dai_docs.train.json", train_files), | |
("dai_docs.valid.json", valid_files) | |
] | |
for LEN in [100, 200, 500]: | |
for output_file, ff in things: | |
if output_file not in dd: | |
dd[output_file] = [] | |
for f in ff: | |
with open(f) as input: | |
blob = input.read() | |
blob = blob.replace("~~", "") | |
blob = blob.replace("==", "") | |
blob = blob.replace("''", "") | |
blob = blob.replace("--", "") | |
blob = blob.replace("**", "") | |
dd[output_file].extend(get_sentences(blob, length=LEN)) | |
for output_file, _ in things: | |
save_thing = [{"output": k.strip(), 'prompt_type': 'plain'} for k in dd[output_file]] | |
with open(output_file, "wt") as f: | |
f.write(json.dumps(save_thing, indent=2)) | |
def get_sentences(blob, length): | |
""" | |
break-up input text into sentences and then output list of sentences of about length in size | |
:param blob: | |
:param length: | |
:return: | |
""" | |
import nltk | |
nltk.download('punkt') | |
from nltk.tokenize import sent_tokenize | |
sentences = sent_tokenize(blob) | |
my_sentences = [] | |
my_string = "" | |
for sentence in sentences: | |
if len(my_string) + len(sentence) <= length: | |
if my_string: | |
my_string += " " + sentence | |
else: | |
my_string = sentence | |
else: | |
my_sentences.append(my_string) | |
my_string = "" | |
return my_sentences or [my_string] | |
def setup_dai_docs(path=None, dst="working_dir_docs", from_hf=False): | |
""" | |
Only supported if have access to source code or HF token for HF spaces and from_hf=True | |
:param path: | |
:param dst: | |
:param from_hf: | |
:return: | |
""" | |
home = os.path.expanduser('~') | |
if from_hf: | |
# assumes | |
from huggingface_hub import hf_hub_download | |
# True for case when locally already logged in with correct token, so don't have to set key | |
token = os.getenv('HUGGING_FACE_HUB_TOKEN', True) | |
path_to_zip_file = hf_hub_download('h2oai/dai_docs', 'dai_docs.zip', token=token, repo_type='dataset') | |
path = 'h2oai' | |
import zipfile | |
with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref: | |
zip_ref.extractall(path) | |
path = os.path.join(path, 'docs/**/*') | |
if path is None: | |
if os.path.isdir(os.path.join(home, 'h2oai')): | |
path = os.path.join(home, "h2oai/docs/**/*") | |
else: | |
assert os.path.isdir(os.path.join(home, 'h2oai.superclean')), '%s does not exist' % path | |
path = os.path.join(home, "h2oai.superclean/docs/**/*") | |
import glob | |
files = list(glob.glob(path, recursive=True)) | |
# pandoc can't find include files | |
remove(dst) | |
os.makedirs(dst) | |
# copy full tree, for absolute paths in rst | |
for fil in files: | |
if os.path.isfile(fil): | |
shutil.copy(fil, dst) | |
# hack for relative path | |
scorers_dir = os.path.join(dst, 'scorers') | |
makedirs(scorers_dir) | |
for fil in glob.glob(os.path.join(dst, '*.frag')): | |
shutil.copy(fil, scorers_dir) | |
return dst | |
def rst_to_outputs(files, min_len=30, max_len=2048 // 2 - 30): | |
# account for sequence length (context window) including prompt and input and output | |
# os.system('pandoc -f rst -t plain ./expert_settings/nlp_settings.rst') | |
import pypandoc | |
basedir = os.path.abspath(os.getcwd()) | |
outputs = [] | |
for fil in files: | |
os.chdir(basedir) | |
os.chdir(os.path.dirname(fil)) | |
fil = os.path.basename(fil) | |
print("Processing %s" % fil, flush=True) | |
# out_format can be one of: asciidoc, asciidoctor, beamer, biblatex, bibtex, commonmark, commonmark_x, | |
# context, csljson, docbook, docbook4, docbook5, docx, dokuwiki, | |
# dzslides, epub, epub2, epub3, fb2, gfm, haddock, html, html4, html5, icml, | |
# ipynb, jats, jats_archiving, jats_articleauthoring, jats_publishing, jira, | |
# json, latex, man, | |
# markdown, markdown_github, markdown_mmd, markdown_phpextra, markdown_strict, | |
# mediawiki, ms, muse, native, odt, opendocument, opml, org, pdf, plain, pptx, | |
# revealjs, rst, rtf, s5, slideous, slidy, tei, texinfo, textile, xwiki, zimwiki | |
out_format = 'plain' | |
# avoid extra new lines injected into text | |
extra_args = ['--wrap=preserve', '--resource path="%s" % dst'] | |
plain_list = [] | |
try: | |
# valid for expert settings | |
input_rst = pypandoc.convert_file(fil, 'rst') | |
input_list = input_rst.split('\n``') | |
for input_subrst in input_list: | |
input_plain = pypandoc.convert_text(input_subrst, format='rst', to='plain') | |
plain_list.append([input_plain, fil]) | |
except Exception as e: | |
print("file exception: %s %s" % (fil, str(e)), flush=True) | |
if not plain_list: | |
# if failed to process as pieces of rst, then | |
output = pypandoc.convert_file(fil, out_format, extra_args=extra_args, format='rst') | |
outputs1 = get_sentences(output, length=max_len) | |
for oi, output in enumerate(outputs1): | |
output = output.replace('\n\n', '\n') | |
plain_list.append([output, fil]) | |
outputs.extend(plain_list) | |
# report: | |
# [print(len(x)) for x in outputs] | |
# deal with blocks longer than context size (sequence length) of 2048 | |
new_outputs = [] | |
num_truncated = 0 | |
num_orig = len(outputs) | |
for output, fil in outputs: | |
if len(output) < max_len: | |
new_outputs.append([output, fil]) | |
continue | |
outputs1 = get_sentences(output, length=max_len) | |
for oi, output1 in enumerate(outputs1): | |
output1 = output1.replace('\n\n', '\n') | |
new_outputs.append([output1, fil]) | |
num_truncated += 1 | |
print('num_orig: %s num_truncated: %s' % (num_orig, num_truncated), flush=True) | |
new_outputs = [[k.strip(), fil] for k, fil in new_outputs if len(k.strip()) > min_len] | |
return new_outputs | |
def test_scrape_dai_docs_all_pandoc(): | |
""" | |
pytest -s -v create_data.py::test_scrape_dai_docs_all_pandoc | |
:return: | |
""" | |
dst = setup_dai_docs() | |
import glob | |
files = list(glob.glob(os.path.join(dst, '*rst'), recursive=True)) | |
basedir = os.path.abspath(os.getcwd()) | |
new_outputs = rst_to_outputs(files) | |
os.chdir(basedir) | |
remove(dst) | |
save_thing = [{"output": k.strip(), 'prompt_type': 'plain'} for k in new_outputs] | |
output_file = "dai_docs.train_cleaned.json" | |
with open(output_file, "wt") as f: | |
f.write(json.dumps(save_thing, indent=2)) | |
def test_config_to_json(): | |
""" | |
Needs to run from Driverless AI source directory. | |
E.g. (base) jon@gpu:~/h2oai$ pytest -s -v /data/jon/h2ogpt/create_data.py::test_config_to_json ; cp config.json /data/jon/h2ogpt/ | |
:return: | |
""" | |
try: | |
# Arrange | |
import json | |
from h2oaicore.systemutils import config | |
toml_list = [] | |
for k, v in config.get_meta_dict().items(): | |
title = (v.title + ": ") if v.title else '' | |
comment = v.comment or '' | |
if not (title or comment): | |
continue | |
toml_list.extend( | |
[ | |
{ | |
'prompt_type': 'plain', | |
'instruction': f"<human>: What does {k} do?\n<bot>: {k.replace('_', ' ')} config.toml: {comment or title}\n<human>:".replace( | |
"\n", ""), | |
}, | |
{ | |
'prompt_type': 'plain', | |
'instruction': f"<human>: Explain {k}.\n<bot>: {k.replace('_', ' ')} config.toml: {comment or title}\n<human>:".replace( | |
"\n", ""), | |
}, | |
{ | |
'prompt_type': 'plain', | |
'instruction': f"<human>: How can I do this: {title}.\n<bot>: Set the {k.replace('_', ' ')} config.toml\n<human>:".replace( | |
"\n", ""), | |
} if title and comment else None, | |
{ | |
'prompt_type': 'human_bot', | |
'instruction': f'Explain the following expert setting for Driverless AI', | |
'input': f"{k}", | |
'output': f"{k.replace('_', ' ')} config.toml: {comment or title}".replace("\n", ""), | |
}, | |
{ | |
'prompt_type': 'human_bot', | |
'instruction': f'Explain the following expert setting for Driverless AI', | |
'input': f"{k}", | |
'output': f"{k.replace('_', ' ')} config.toml: {title}{comment}".replace("\n", ""), | |
}, | |
{ | |
'prompt_type': 'human_bot', | |
'instruction': f'Explain the following expert setting for Driverless AI', | |
'input': f"{k.replace('_', ' ')}", | |
'output': f"{k.replace('_', ' ')} config.toml: {title}{comment}".replace("\n", ""), | |
}, | |
{ | |
'prompt_type': 'human_bot', | |
'instruction': f'Explain the following expert setting for Driverless AI', | |
'input': f"{title}", | |
'output': f"{k.replace('_', ' ')} config.toml: {title}{comment}".replace("\n", ""), | |
}, | |
{ | |
'prompt_type': 'human_bot', | |
'instruction': f'Provide a short explanation of the expert setting {k}', | |
'output': f"{k.replace('_', ' ')} config.toml: {comment or title}".replace("\n", ""), | |
}, | |
{ | |
'prompt_type': 'human_bot', | |
'instruction': f'Provide a detailed explanation of the expert setting {k}', | |
'output': f"{k.replace('_', ' ')} config.toml: {title}{comment}".replace("\n", ""), | |
}, | |
] | |
) | |
toml_list = [x for x in toml_list if x] | |
with open("config.json", "wt") as f: | |
f.write(json.dumps(toml_list, indent=2)) | |
except Exception as e: | |
print("Exception: %s" % str(e), flush=True) | |
def copy_tree(src, dst, follow_symlink=False): | |
makedirs(dst, exist_ok=True) | |
for (path, dirs, files) in os.walk(src, followlinks=follow_symlink): | |
new_path = path.replace(src, dst) | |
makedirs(new_path, exist_ok=True) | |
for file in files: | |
filename = os.path.join(path, file) | |
new_filename = os.path.join(new_path, file) | |
# print("%s -> %s" % (filename, new_filename)) | |
try: | |
atomic_copy(filename, new_filename) | |
except FileNotFoundError: | |
pass | |
def atomic_move(src, dst): | |
try: | |
shutil.move(src, dst) | |
except (shutil.Error, FileExistsError): | |
pass | |
remove(src) | |
def atomic_copy(src=None, dst=None, with_permissions=True): | |
if os.path.isfile(dst): | |
return | |
import uuid | |
my_uuid = uuid.uuid4() | |
dst_tmp = dst + str(my_uuid) | |
makedirs(os.path.dirname(dst), exist_ok=True) | |
if with_permissions: | |
shutil.copy(src, dst_tmp) | |
else: | |
shutil.copyfile(src, dst_tmp) | |
atomic_move(dst_tmp, dst) | |
remove(dst_tmp) | |
def makedirs(path, exist_ok=True): | |
""" | |
Avoid some inefficiency in os.makedirs() | |
:param path: | |
:param exist_ok: | |
:return: | |
""" | |
if os.path.isdir(path) and os.path.exists(path): | |
assert exist_ok, "Path already exists" | |
return path | |
os.makedirs(path, exist_ok=exist_ok) | |
## Download from https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_unfiltered_cleaned_split.json | |
## Turn into simple instruct prompt type. No context/previous conversations. | |
def test_prep_instruct_vicuna(): | |
from datasets import load_dataset | |
filename = 'ShareGPT_unfiltered_cleaned_split.json' | |
if not os.path.exists(filename): | |
os.system( | |
'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % filename) | |
data = load_dataset("json", data_files={"train": filename})["train"] | |
training_rows = [] | |
for i in range(data.num_rows): | |
conversations = data[i]['conversations'] | |
assert isinstance(conversations, list), conversations | |
convo = "" | |
for j, conv in enumerate(conversations): | |
# Get ready for generate.py prompt_type=human_bot | |
# But train with prompt_type=plain | |
if conv['from'] == 'human': | |
FROM = '<human>: ' | |
elif conv['from'] == 'gpt': | |
FROM = '<bot>: ' | |
convo += f"{FROM}" + conv['value'] + "\n" | |
if convo: | |
training_rows.append(dict(input=convo)) | |
with open(filename + ".generate_human_bot.train_plain.json", "wt") as f: | |
f.write(json.dumps(training_rows, indent=2)) | |
POSTFIX = ".generate_human_bot.train_plain.json" | |
# https://bair.berkeley.edu/blog/2023/04/03/koala/ | |
OIG_DATASETS = [ | |
"unified_chip2.jsonl", | |
"unified_grade_school_math_instructions.jsonl", | |
"unified_poetry_2_song.jsonl", | |
"unified_plot_screenplay_books_dialog.jsonl", | |
] | |
# hub issue: https://huggingface.co/datasets/laion/OIG/discussions/4 | |
ALL_OIG_DATASETS = ['unified_abstract_infill.jsonl', | |
'unified_basic.jsonl', | |
'unified_canadian_parliament.jsonl', | |
'unified_chip2.jsonl', | |
'unified_conv_finqa.jsonl', | |
'unified_cuad.jsonl', | |
'unified_essays.jsonl', | |
'unified_flan.jsonl.gz', | |
'unified_grade_school_math_instructions.jsonl', | |
'unified_hc3_human.jsonl', | |
'unified_image_prompts_instructions.jsonl', | |
'unified_joke_explanations.jsonl', | |
'unified_mathqa_flanv2_kojma_cot.jsonl', | |
'unified_merged_code_xp3.jsonl', | |
'unified_multi_news.jsonl', | |
'unified_multi_sum.jsonl', | |
'unified_ni.jsonl.gz', | |
'unified_nq.jsonl', | |
'unified_openai_summarize_tldr.jsonl', | |
'unified_oscar_en_sample_dialog.jsonl', | |
'unified_p3.jsonl.gz', | |
'unified_plot_screenplay_books_dialog.jsonl', | |
'unified_poetry_2_song.jsonl', | |
'unified_poetry_instructions.jsonl', | |
'unified_rallio_safety_and_prosocial.jsonl', | |
'unified_rallio_soda_upgraded_2048.jsonl', | |
'unified_soda_dialog.jsonl', | |
'unified_sqlv1.jsonl', | |
'unified_sqlv2.jsonl', | |
'unified_squad_v2.jsonl', | |
'unified_squad_v2_more_neg.jsonl', | |
'unified_ul2_plus_oscar_en_sample_dialog.jsonl', | |
'unified_unifiedskg_instructions.jsonl', | |
'unified_unnatural_instructions.jsonl', | |
'unified_xp3_sample.jsonl'] | |
useful_oig_files = ['unified_rallio_safety_and_prosocial.jsonl.parquet', | |
'unified_chip2.jsonl.parquet', | |
'unified_cuad.jsonl.parquet', | |
'unified_essays.jsonl.parquet', | |
'unified_flan.jsonl.gz.parquet', | |
'unified_grade_school_math_instructions.jsonl.parquet', | |
'unified_hc3_human.jsonl.parquet', | |
'unified_mathqa_flanv2_kojma_cot.jsonl.parquet', | |
'unified_merged_code_xp3.jsonl.parquet', | |
'unified_multi_news.jsonl.parquet', | |
# 'unified_multi_sum.jsonl.parquet' | |
'unified_ni.jsonl.gz.parquet', | |
'unified_openai_summarize_tldr.jsonl.parquet', | |
# 'unified_oscar_en_sample_dialog.jsonl.parquet', # create text containing these N words, not specific | |
'unified_plot_screenplay_books_dialog.jsonl.parquet', | |
'unified_soda_dialog.jsonl.parquet', | |
'unified_unnatural_instructions.jsonl.parquet', | |
] | |
def test_get_small_sample_oig_data(filename): | |
if not os.path.exists(filename): | |
os.system('wget https://huggingface.co/datasets/laion/OIG/resolve/main/%s' % filename) | |
import json | |
rows = [] | |
with open(filename, "r") as f: | |
for line in f.readlines(): | |
row = json.loads(line) | |
rows.append(dict(input=row["text"])) | |
with open(filename + POSTFIX, "w") as f: | |
f.write(json.dumps(rows, indent=2)) | |
def test_download_useful_data_as_parquet(filename): | |
dest_file = filename + '.parquet' | |
if dest_file not in useful_oig_files: | |
pytest.skip('file declared not useful') | |
if not os.path.exists(filename): | |
os.system('wget https://huggingface.co/datasets/laion/OIG/resolve/main/%s' % filename) | |
if not os.path.exists(dest_file): | |
df = pd.read_json(path_or_buf=filename, lines=True) | |
df.to_parquet(dest_file, index=False) | |
def test_merge_shuffle_small_sample_oig_data(): | |
np.random.seed(1234) | |
rows = [] | |
for filename in OIG_DATASETS: | |
with open(filename + POSTFIX, "r") as f: | |
rows.extend(json.loads(f.read())) | |
np.random.shuffle(rows) | |
with open("merged_shuffled_OIG_%s.json" % hashlib.sha256(str(OIG_DATASETS).encode()).hexdigest()[:10], "w") as f: | |
f.write(json.dumps(rows, indent=2)) | |
def test_join_jsons(): | |
files = ['config.json'] * 1 + \ | |
['dai_docs.train_cleaned.json'] * 2 + \ | |
['dai_faq.json'] * 3 | |
print(files) | |
lst = [] | |
[lst.extend(json.load(open(fil, 'rt'))) for fil in files] | |
print(len(lst)) | |
json.dump(lst, open("merged.json", "wt"), indent=2) | |
def test_make_rlhf_good_data(filename): | |
from datasets import load_dataset | |
rows = load_dataset(filename)["train"]["chosen"] | |
new_rows = [] | |
for row in rows: | |
if row[:2] == "\n\n": | |
row = row[2:] | |
row = row.replace("Human: ", "<human>: ") | |
row = row.replace("Assistant: ", "<bot>: ") | |
new_rows.append(dict(input=row)) | |
with open(filename.replace("/", "_") + POSTFIX, "w") as f: | |
f.write(json.dumps(new_rows, indent=2)) | |
def test_show_prompts(): | |
files = ['config.json'] * 1 + \ | |
['dai_docs.train_cleaned.json'] * 1 + \ | |
['dai_faq.json'] * 1 | |
file_points = [json.load(open(fil, 'rt')) for fil in files] | |
from prompter import generate_prompt | |
for data_points in file_points: | |
for data_point in data_points: | |
print(generate_prompt(data_point, 'plain', '', False, False, False)[0]) | |
def test_get_open_datasets(): | |
# HF changed things so don't get raw list of all datasets, so not have to filter, but can't do negative filter | |
open_tags = ['license:Apache License 2.0', | |
'license:mit', | |
'license:apache', | |
'license:apache2', | |
'license:apache-2.0', | |
'license:bsd', | |
'license:bsd-2-clause', | |
'license:bsd-3-clause', | |
'license:bsd-3-clause-clear', | |
'license:lgpl-2.1', | |
'license:lgpl-3.0', | |
'license:lgpl-lr', | |
'license:lgpl', | |
'license:openrail++', | |
'license:openrail', | |
'license:bigscience-bloom-rail-1.0', | |
# 'license:agpl-3.0', | |
'license:other', | |
'license:unknown', | |
# 'license:mpl-2.0', # ok, but would have to include original copyright, license, source, copies in distribution | |
# Attribution required: | |
'license:odc-by', | |
'license:cc-by-4.0', | |
'license:cc-by-3.0', | |
'license:cc-by-2.0', | |
'license:cc-by-2.5', | |
# 'license:cc-by-sa-4.0', # would require same license | |
'license:odbl', | |
'license:pddl', | |
'license:ms-pl', | |
'license:zlib', | |
] | |
# bad license: cc-by-nc-4.0 | |
from huggingface_hub import list_datasets | |
datasets = flatten_list([[x for x in list_datasets(filter=y)] for y in open_tags]) | |
datasets += [x for x in list_datasets(author='openai')] | |
# check all: | |
all_license_tags = set(flatten_list([[y for y in x.tags if 'license' in y] for x in datasets])) | |
print(len(all_license_tags)) | |
open_datasets = [x for x in datasets if any([y in x.tags for y in open_tags]) or 'license:' not in str(x.tags)] | |
print('open_datasets', len(open_datasets)) | |
all_task_tags = set(flatten_list([[y for y in x.tags if 'task' in y] for x in open_datasets])) | |
print('all_task_tags', len(all_task_tags)) | |
excluded_tags = ['image', 'hate', 'tabular', 'table-', 'classification', 'retrieval', | |
'translation', 'identification', 'object', 'mask', 'to-text', | |
'face-detection', 'audio', 'voice', 'reinforcement', 'depth-est', | |
'forecasting', 'parsing', 'visual', 'speech', 'multiple-choice', | |
'slot-filling', 'irds/argsme', '-scoring', 'other', 'graph-ml', | |
'feature-extraction', 'keyword-spotting', | |
'coreference-resolution', 'segmentation', | |
'word-sense-disambiguation', | |
'lemmatization'] | |
task_tags = [x.replace('task_categories:', '').replace('task_ids:', '') | |
for x in all_task_tags if not any([y in x for y in | |
excluded_tags])] | |
print('task_tags', len(task_tags)) | |
# str(x.tags) to catch any pattern match to anything in list | |
open_tasked_datasets = [x for x in open_datasets if | |
any([y in str([x for x in x.tags if 'task' in x]) for y in task_tags]) and | |
not any([y in str([x for x in x.tags if 'task' in x]) for y in excluded_tags]) or | |
'task_categories' not in str(x.tags) and 'task_ids' not in str(x.tags)] | |
open_tasked_datasets = [x for x in open_tasked_datasets if not x.disabled] | |
open_tasked_datasets = [x for x in open_tasked_datasets if not x.gated] | |
open_tasked_datasets = [x for x in open_tasked_datasets if not x.private] | |
print('open_tasked_datasets', len(open_tasked_datasets)) | |
sizes = list(set(flatten_list([[(y, x.id) for y in x.tags if 'size' in y] for x in open_tasked_datasets]))) | |
languages = list(set(flatten_list([[(y, x.id) for y in x.tags if 'language:' in y] for x in open_tasked_datasets]))) | |
open_english_tasked_datasets = [x for x in open_tasked_datasets if | |
'language:' not in str(x.tags) or | |
'language:en' in str(x.tags)] | |
small_open_english_tasked_datasets = [x for x in open_english_tasked_datasets if | |
'n<1K' in str(x.tags) or | |
'1K<n<10K' in str(x.tags) or | |
'1K0<n<100K' in str(x.tags) or | |
'100K<n<1M' in str(x.tags) or | |
'size_category' not in str(x.tags) | |
] | |
# 'aeslc' : email_body, subject -> summarization? | |
# load_dataset(open_tasked_datasets[0].id).data['train'].to_pandas() | |
ids = [x.id for x in small_open_english_tasked_datasets] | |
# sanity checks | |
# https://bair.berkeley.edu/blog/2023/04/03/koala/ | |
assert 'alespalla/chatbot_instruction_prompts' in ids | |
assert 'laion/OIG' in ids | |
assert 'openai/webgpt_comparisons' in ids | |
assert 'openai/summarize_from_feedback' in ids | |
assert 'Anthropic/hh-rlhf' in ids | |
# useful but not allowed for commercial purposes: | |
# https://huggingface.co/datasets/squad | |
print('open_english_tasked_datasets: ', ids, flush=True) | |
exclude_ids = ['allenai/nllb', # translation only | |
'hf-internal-testing/fixtures_image_utils', # testing | |
'allenai/c4', # search-url | |
'agemagician/uniref50', # unknown | |
'huggingface-course/documentation-images', # images | |
'smilegate-ai/kor_unsmile', # korean | |
'MohamedRashad/ChatGPT-prompts', # ChatGPT/LearnGPT/https://www.emergentmind.com/ | |
'humarin/chatgpt-paraphrases', # Paraphrase using ChatGPT | |
'Jeska/vaccinchat', # not useful | |
'alespalla/chatbot_instruction_prompts', # mixes alpaca | |
'allenai/prosocial-dialog', | |
# already exlucded, but wrongly in other datasets that say more permissive license | |
'AlekseyKorshuk/persona-chat', # low quality | |
'bavard/personachat_truecased', # low quality | |
'adamlin/daily_dialog', # medium quality conversations | |
'adamlin/FewShotWoz', # low quality | |
'benjaminbeilharz/better_daily_dialog', # low quality | |
'benjaminbeilharz/daily_dialog_w_turn_templates', # low | |
'benjaminbeilharz/empathetic_dialogues_for_lm', # low | |
'GEM-submissions/GEM__bart_base_schema_guided_dialog__1645547915', # NA | |
'ia-bentebib/conv_ai_2_fr', # low fr | |
'ia-bentebib/daily_dialog_fr', # low fr | |
'ia-bentebib/dialog_re_fr', # low fr | |
'ia-bentebib/empathetic_dialogues_fr', # low fr | |
'roskoN/dailydialog', # low | |
'VadorMazer/skyrimdialogstest', # low | |
'bigbio/med_qa', # med specific Q/A | |
'biu-nlp/qa_srl2018', # low quality Q/A | |
'biu-nlp/qa_discourse', # low quality Q/A | |
'iarfmoose/qa_evaluator', # low quality Q/A | |
'jeopardy', # low quality Q/A -- no reasoning | |
'narrativeqa', # low quality Q/A | |
'nomic-ai/gpt4all_prompt_generations', # bad license | |
'nomic-ai/gpt4all_prompt_generations_with_p3', # bad license | |
'HuggingFaceH4/alpaca', # bad license | |
'tatsu-lab/alpaca', # ToS breaking | |
'yahma/alpaca-cleaned', # ToS breaking | |
'Hello-SimpleAI/HC3', # bad license | |
'glue', # no reasoning QA | |
'sahil2801/CodeAlpaca-20k', # bad license | |
'Short-Answer-Feedback/saf_communication_networks_english', # long Q, medium A | |
] | |
small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if x.id not in exclude_ids] | |
# some ids clearly speech related | |
small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if 'speech' not in x.id] | |
# HF testing | |
small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if | |
'hf-internal-testing' not in x.id] | |
small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if | |
'chinese' not in x.id] | |
sorted_small_open_english_tasked_datasets = sorted([(x.downloads, x) for x in small_open_english_tasked_datasets], | |
key=lambda x: x[0], reverse=True) | |
# NOTES: | |
# Run like pytest -s -v create_data.py::test_get_open_datasets &> getdata9.log | |
# See what needs config passed and add: | |
# grep 'load_dataset(' getdata9.log|grep -v data_id|less -S | |
# grep "pip install" getdata9.log | |
# NOTE: Some datasets have default config, but others are there. Don't know how to access them. | |
""" | |
https://huggingface.co/datasets/wikihow/blob/main/wikihow.py | |
https://github.com/mahnazkoupaee/WikiHow-Dataset | |
https://ucsb.box.com/s/ap23l8gafpezf4tq3wapr6u8241zz358 | |
https://ucsb.app.box.com/s/ap23l8gafpezf4tq3wapr6u8241zz358 | |
""" | |
""" | |
# some ambiguous or non-commercial datasets | |
https://github.com/PhoebusSi/alpaca-CoT | |
""" | |
timeout = 3 * 60 | |
# laion/OIG takes longer | |
for num_downloads, dataset in sorted_small_open_english_tasked_datasets: | |
data_id = dataset.id | |
func = do_one | |
args = (data_id, num_downloads) | |
kwargs = {} | |
with ProcessPoolExecutor(max_workers=1) as executor: | |
future = executor.submit(func, *args, **kwargs) | |
try: | |
future.result(timeout=timeout) | |
except concurrent.futures.TimeoutError: | |
print("\n\ndata_id %s timeout\n\n" % data_id, flush=True) | |
for child in psutil.Process(os.getpid()).children(recursive=True): | |
os.kill(child.pid, signal.SIGINT) | |
os.kill(child.pid, signal.SIGTERM) | |
os.kill(child.pid, signal.SIGKILL) | |
def do_one(data_id, num_downloads): | |
from datasets import load_dataset | |
out_file = "data_%s.parquet" % str(data_id.replace('/', '_')) | |
if os.path.isfile(out_file) and os.path.getsize(out_file) > 1024 ** 3: | |
return | |
try: | |
print("Loading data_id %s num_downloads: %s" % (data_id, num_downloads), flush=True) | |
avail_list = None | |
try: | |
data = load_dataset(data_id, 'foobar') | |
except Exception as e: | |
if 'Available: ' in str(e): | |
avail_list = ast.literal_eval(str(e).split('Available:')[1].strip()) | |
else: | |
avail_list = None | |
if avail_list is None: | |
avail_list = [None] | |
print("%s avail_list: %s" % (data_id, avail_list), flush=True) | |
for name in avail_list: | |
out_file = "data_%s_%s.parquet" % (str(data_id.replace('/', '_')), str(name)) | |
if os.path.isfile(out_file): | |
continue | |
data = load_dataset(data_id, name) | |
column_names_dict = data.column_names | |
column_names = column_names_dict[list(column_names_dict.keys())[0]] | |
print("Processing data_id %s num_downloads: %s columns: %s" % (data_id, num_downloads, column_names), | |
flush=True) | |
data_dict = data.data | |
col_dict = data.num_columns | |
first_col = list(col_dict.keys())[0] | |
if 'train' in data_dict: | |
df = data['train'].to_pandas() | |
else: | |
df = data[first_col].to_pandas() | |
# csv has issues with escaping chars, even for datasets I know I want | |
df.to_parquet(out_file, index=False) | |
except Exception as e: | |
t, v, tb = sys.exc_info() | |
ex = ''.join(traceback.format_exception(t, v, tb)) | |
print("Exception: %s %s" % (data_id, ex), flush=True) | |
def test_otherlic(): | |
from huggingface_hub import list_datasets | |
lic = ['license:odc-by', | |
'license:cc-by-4.0', | |
'license:cc-by-3.0', | |
'license:cc-by-2.0', | |
'license:cc-by-2.5', | |
'license:cc-by-sa-4.0', | |
'license:odbl', | |
'license:pddl', | |
'license:ms-pl', | |
'license:zlib', | |
] | |
datasets = flatten_list([[x for x in list_datasets(filter=y) if 'translation' not in str(x.tags)] for y in lic]) | |
print(len(datasets)) | |
# These useful datasets are determined based upon data sample, column types, and uniqueness compared to larger datasets like Pile | |
# grep columns getdata13.log|grep -v "\['image'\]"|sort|uniq|grep -v tokens|grep -v "'image'"|grep -v embedding|grep dialog | |
useful = ['Dahoas/instruct-human-assistant-prompt', | |
'Dahoas/first-instruct-human-assistant-prompt', | |
'knkarthick/dialogsum', # summary of conversation | |
'McGill-NLP/FaithDial', # medium quality | |
'Zaid/quac_expanded', # medium quality context + QA | |
'0-hero/OIG-small-chip2', # medium | |
'alistvt/coqa-flat', # QA medium | |
'AnonymousSub/MedQuAD_47441_Question_Answer_Pairs', # QA medium | |
'Anthropic/hh-rlhf', # high quality # similar to Dahoas/full-hh-rlhf | |
'arjunth2001/online_privacy_qna', # good quality QA | |
'Dahoas/instruct_helpful_preferences', # medium quality instruct | |
'Dahoas/rl-prompt-dataset', # medium chat | |
'Dahoas/rm-static', # medium chat | |
'Dahoas/static-hh', # medium chat # HuggingFaceH4/self_instruct | |
'Dahoas/synthetic-instruct-gptj-pairwise', # medium chat | |
'eli5', # QA if prompt ELI5 | |
'gsm8k', # QA (various) | |
'guanaco/guanaco', # prompt/response | |
'kastan/rlhf-qa-comparisons', # good QA | |
'kastan/rlhf-qa-conditional-generation-v2', # prompt answer | |
'OllieStanley/humaneval-mbpp-codegen-qa', # code QA, but started from words, so better than other code QA | |
'OllieStanley/humaneval-mbpp-testgen-qa', # code QA | |
'Graverman/Instruct-to-Code', # code QA | |
'openai/summarize_from_feedback', # summarize | |
'relbert/analogy_questions', # analogy QA | |
'yitingxie/rlhf-reward-datasets', # prompt, chosen, rejected. | |
'yizhongw/self_instruct', # instruct (super natural & instruct) | |
'HuggingFaceH4/asss', # QA, big A | |
'kastan/rlhf-qa-conditional-generation-v2', # QA | |
'cosmos_qa', # context QA | |
'vishal-burman/c4-faqs', # QA but not so much reasoning, but alot of text | |
'squadshifts', # QA from context | |
'hotpot_qa', # QA from context | |
'adversarial_qa', # QA from context | |
'allenai/soda', # dialog -> narrative/summary | |
'squad_v2', # context QA | |
'squadshifts', # context QA | |
'dferndz/cSQuAD1', # context QA | |
'dferndz/cSQuAD2', # context QA | |
'din0s/msmarco-nlgen', # context QA | |
'domenicrosati/TruthfulQA', # common sense truthful QA -- trivia but good trivia | |
'hotpot_qa', # context, QA | |
'HuggingFaceH4/self-instruct-eval', # instruct QA, medium quality, some language reasoning | |
'kastan/EE_QA_for_RLHF', # context QA | |
'KK04/LogicInference_OA', # instruction logical QA | |
'lmqg/qa_squadshifts_synthetic', # context QA | |
'lmqg/qg_squad', # context QA | |
'lmqg/qg_squadshifts', # context QA | |
'lmqg/qg_subjqa', # context QA | |
'pszemraj/HC3-textgen-qa', | |
# QA medium, has human responses -- humans tend to provide links instead of trying to answer | |
'pythonist/newdata', # long context, QA, brief A | |
'ropes', # long background, situation, question, A | |
'wikitablequestions', # table -> QA | |
'bigscience/p3', # context QA but short answers | |
] | |
code_useful = ['0n1xus/codexglue', | |
'openai_humaneval', | |
'koutch/staqc', | |
] | |
maybe_useful = ['AlekseyKorshuk/comedy-scripts', | |
'openbookqa', # hard to parse, low reasoning | |
'qed', # reasonable QA, but low reasoning | |
'selqa', # candidate answers | |
'HuggingFaceH4/instruction-pilot-outputs-filtered', | |
'GBaker/MedQA-USMLE-4-options', # medical QA with long questions | |
'npc-engine/light-batch-summarize-dialogue', # dialog summarize, kinda low specific quality | |
] | |
summary_useful = ['austin/rheum_abstracts', | |
'CarperAI/openai_summarize_comparisons', # summarize chosen/rejected | |
'CarperAI/openai_summarize_tldr', # summarize QA | |
'ccdv/cnn_dailymail', # summarize news | |
'ccdv/govreport-summarization', # summarize high quality | |
'ccdv/pubmed-summarization', # summarize high quality | |
'duorc', # plot -> QA | |
'farleyknight/big_patent_5_percent', # desc -> abstract | |
'multi_news', # summary | |
'opinosis', | |
'SophieTr/reddit_clean', | |
'allenai/mup', # long text -> summary | |
'allenai/multi_lexsum', # long text -> summary | |
'big_patent', | |
'allenai/wcep_dense_max', | |
'awinml/costco_long_practice', | |
'GEM/xsum', | |
'ratishsp/newshead', | |
'RussianNLP/wikiomnia', # russian | |
'stacked-summaries/stacked-xsum-1024', | |
] | |
math_useful = [ | |
'competition_math' | |
] | |
skipped = ['c4', # maybe useful, used for flan, but skipped due to size | |
] | |
""" | |
To get training data from oig: | |
pytest test_oig test_grade_final test_finalize_to_json | |
""" | |
human = '<human>:' | |
bot = '<bot>:' | |
def test_assemble_and_detox(): | |
import re | |
from profanity_check import predict_prob | |
df_list = [] | |
for data in useful_oig_files: | |
print("Processing %s" % data, flush=True) | |
df = pd.read_parquet(data) | |
df = df.reset_index(drop=True) | |
# chop up into human/bot interactions of no more than 10kB per row | |
text_list = df[['text']].values.ravel().tolist() | |
new_text = [] | |
max_len = 2048 # uber cutoff | |
MAX_LEN = 2048 // 2 - 30 # max len per question/answer | |
for text in tqdm(text_list): | |
human_starts = [m.start() for m in re.finditer('<human>: ', text)] | |
if len(human_starts) == 1: | |
human_starts = [0, len(text)] # always go into for loop below | |
blurb = '' | |
for i in range(len(human_starts) - 1): | |
interaction = text[human_starts[i]: human_starts[i + 1]][:max_len] | |
blurb += interaction | |
if len(blurb) >= MAX_LEN: | |
blurb = get_sentences(blurb, length=MAX_LEN)[0] | |
new_text.append(blurb + "\n<human>:") | |
blurb = '' | |
if blurb: | |
blurb = get_sentences(blurb, length=MAX_LEN)[0] | |
new_text.append(blurb + "\n<human>:") | |
if len(new_text) > len(text_list): | |
print("Added %d new rows (before: %d)" % (len(new_text) - df.shape[0], df.shape[0])) | |
df = pd.DataFrame({"text": new_text, "source": [data] * len(new_text)}) | |
df = df.drop_duplicates(keep='first') | |
print(df['text'].apply(lambda x: len(x)).describe()) | |
assert df['text'].apply(lambda x: len(x)).max() <= 2 * max_len | |
# faster than better_profanity, do early | |
df['profanity'] = predict_prob(df['text']) | |
before_rows = df.shape[0] | |
df = df[df['profanity'] < 0.25] # drop any low quality stuff | |
after_rows = df.shape[0] | |
print("Dropped %d rows out of %d due to alt-profanity-check" % (before_rows - after_rows, before_rows)) | |
df_list.append(df) | |
print("Done processing %s -> %s rows" % (data, df.shape[0]), flush=True) | |
print("So far have %d rows" % sum([len(x) for x in df_list])) | |
df_final = pd.concat(df_list) | |
df_final = df_final.sample(frac=1, random_state=1234).reset_index(drop=True) | |
df_final.to_parquet('h2oGPT.cleaned.human_bot.shorter.parquet', index=False) | |
def test_basic_cleaning(): | |
# from better_profanity import profanity | |
# https://pypi.org/project/alt-profanity-check/ | |
from profanity_check import predict | |
df_list = [] | |
for data in useful_oig_files: | |
# for data in useful_oig_files[:5]: | |
# for data in ['unified_openai_summarize_tldr.jsonl.parquet']: | |
print("Processing %s" % data, flush=True) | |
df = pd.read_parquet(data) | |
df = df.reset_index(drop=True) | |
# NOTE: Not correct if multiple human-bot interactions, but those dialogs even more desired | |
# avg_chars = len(df['text'][0])/(df['text'][0].count(human)+df['text'][0].count(bot)) | |
df['avg_words'] = df['text'].apply(lambda x: x.count(' ') / (x.count(human) + x.count(bot)) / 2.0) | |
df['avg_bot_words'] = df['text'].apply(lambda x: x.split(bot)[1].count(' ') / x.count(bot)) | |
# df['bad_words'] = df['text'].apply(lambda x: profanity.contains_profanity(x)) | |
# low_quality_patterns = ['Write the rest of this wikipedia article'] | |
res = predict(df['text']) | |
df['bad_words'] = res | |
df = df.reset_index(drop=True) | |
df = df[df['bad_words'] == 0] | |
df = df[['text', 'avg_words', 'avg_bot_words']] | |
df = df.drop_duplicates(keep='first') | |
print(df[df['avg_words'] == df['avg_words'].max()]['text'].values) | |
median_words = np.median(df['avg_words']) | |
min_words_per_entity = max(30, 0.8 * median_words) | |
max_words_per_entity = 2048 # too hard to learn from for now | |
df = df[df['avg_words'] > min_words_per_entity] | |
df = df[df['avg_words'] < max_words_per_entity] | |
min_words_per_entity = max(20, 0.5 * median_words) # bot should say stuff for now | |
max_words_per_entity = 2048 # too hard to learn from for now | |
df = df[df['avg_bot_words'] > min_words_per_entity] | |
df = df[df['avg_bot_words'] < max_words_per_entity] | |
df_list.append(df) | |
print("Done processing %s -> %s rows" % (data, df.shape[0]), flush=True) | |
df_final = pd.concat(df_list) | |
df_final.to_parquet('h2oGPT.cleaned.human_bot.parquet', index=False) | |
from joblib import Parallel, delayed, effective_n_jobs | |
from sklearn.utils import gen_even_slices | |
from sklearn.utils.validation import _num_samples | |
def parallel_apply(df, func, n_jobs=-1, **kwargs): | |
""" Pandas apply in parallel using joblib. | |
Uses sklearn.utils to partition input evenly. | |
Args: | |
df: Pandas DataFrame, Series, or any other object that supports slicing and apply. | |
func: Callable to apply | |
n_jobs: Desired number of workers. Default value -1 means use all available cores. | |
**kwargs: Any additional parameters will be supplied to the apply function | |
Returns: | |
Same as for normal Pandas DataFrame.apply() | |
""" | |
if effective_n_jobs(n_jobs) == 1: | |
return df.apply(func, **kwargs) | |
else: | |
ret = Parallel(n_jobs=n_jobs)( | |
delayed(type(df).apply)(df[s], func, **kwargs) | |
for s in gen_even_slices(_num_samples(df), effective_n_jobs(n_jobs))) | |
return pd.concat(ret) | |
def add_better_profanity_flag(df): | |
from better_profanity import profanity | |
df['better_profanity'] = parallel_apply( | |
df['text'], | |
lambda x: profanity.contains_profanity(x), | |
n_jobs=-1, | |
) | |
return df | |
def add_textstat_grade(df): | |
import textstat | |
def myfunc(x): | |
return textstat.flesch_kincaid_grade(x) # simple grade | |
if False: | |
import dask.dataframe as dd | |
# 40 seconds for 1000 rows, but have 1,787,799 rows | |
ddata = dd.from_pandas(df, npartitions=120) | |
df['flesch_grade'] = ddata['text'].apply(myfunc).compute() | |
if True: | |
# fast way | |
df['flesch_grade'] = parallel_apply(df['text'], myfunc, n_jobs=-1) | |
return df | |
def add_deberta_grade(df): | |
from transformers import AutoModelForSequenceClassification, AutoTokenizer | |
import torch | |
reward_name = "OpenAssistant/reward-model-deberta-v3-large-v2" | |
rank_model, tokenizer = AutoModelForSequenceClassification.from_pretrained( | |
reward_name), AutoTokenizer.from_pretrained(reward_name) | |
device = 'cuda' if torch.cuda.is_available() else 'cpu' | |
rank_model.to(device) | |
def get_question(x): | |
return x.replace('<human>: ', '').split('<bot>:')[0] | |
def get_answer(x): | |
try: | |
answer = x.split('<bot>: ')[1].split('<human>:')[0].replace('<bot>: ', '') | |
except: | |
answer = x.split('<bot>:')[1].split('<human>:')[0].replace('<bot>:', '') | |
return answer | |
df['question'] = parallel_apply(df['text'], get_question, n_jobs=-1) | |
df['answer'] = parallel_apply(df['text'], get_answer, n_jobs=-1) | |
from datasets import Dataset | |
from transformers import pipeline | |
from transformers.pipelines.pt_utils import KeyPairDataset | |
import tqdm | |
pipe = pipeline( | |
"text-classification", | |
model=reward_name, | |
device="cuda:0" if torch.cuda.is_available() else "cpu" | |
) | |
start = 0 | |
batch_size = 64 * 16 | |
micro_batch = orig_micro_batch = 16 | |
end = 0 | |
import socket | |
checkpoint = "grades.%s.pkl" % socket.gethostname() | |
grades = [] | |
import pickle | |
if os.path.exists(checkpoint): | |
with open(checkpoint, "rb") as f: | |
start, grades = pickle.loads(f.read()) | |
last_oom = 0 | |
while end < df.shape[0]: | |
# manual batching to handle OOM more gracefully | |
end = min(start + batch_size, df.shape[0]) | |
if start == end: | |
break | |
dataset = Dataset.from_pandas(df.iloc[start:end, :]) | |
try: | |
grades.extend([ | |
x['score'] for x in tqdm.tqdm( | |
pipe(KeyPairDataset(dataset, "question", "answer"), batch_size=micro_batch) | |
) | |
]) | |
except torch.cuda.OutOfMemoryError: | |
last_oom = start | |
micro_batch = max(1, micro_batch // 2) | |
print("OOM - retrying with micro_batch=%d" % micro_batch) | |
continue | |
if last_oom == start: | |
micro_batch = orig_micro_batch | |
print("Returning to micro_batch=%d" % micro_batch) | |
assert len(grades) == end | |
start = end | |
with open(checkpoint, "wb") as f: | |
f.write(pickle.dumps((end, grades))) | |
print("%d/%d" % (end, df.shape[0])) | |
df['grade_deberta'] = grades | |
if os.path.exists(checkpoint): | |
os.remove(checkpoint) | |
return df | |
def test_chop_by_lengths(): | |
file = "h2oGPT.cleaned.human_bot.shorter.parquet" | |
df = pd.read_parquet(file).reset_index(drop=True) | |
df = count_human_bot_lengths(df) | |
df['rand'] = np.random.rand(df.shape[0]) | |
df['rand2'] = np.random.rand(df.shape[0]) | |
before_rows = df.shape[0] | |
# throw away short human/bot responses with higher likelihood | |
df = df[(df['len_human_mean'] > 20)] # never keep very short ones | |
df = df[(df['len_human_mean'] > 30) | (df['rand'] < 0.2)] | |
df = df[(df['len_human_mean'] > 50) | (df['rand'] < 0.5)] | |
df = df[(df['len_human_max'] < 10000)] # drop super long (basically only human) ones | |
df = df[(df['len_bot_mean'] > 20)] # never keep very short ones | |
df = df[(df['len_bot_mean'] > 30) | (df['rand2'] < 0.2)] | |
df = df[(df['len_bot_mean'] > 50) | (df['rand2'] < 0.5)] | |
df = df[(df['len_bot_max'] < 10000)] # drop super long (only bot) ones | |
assert df['text'].apply(lambda x: len(x)).max() < 20000 | |
df = df.drop(['rand', 'rand2'], axis=1) | |
after_rows = df.shape[0] | |
print("Chopped off %d out of %d rows due to length" % (before_rows - after_rows, before_rows)) | |
print(df.describe()) | |
df.to_parquet('h2oGPT.cleaned.chopped.human_bot.shorter.parquet', index=False) | |
def count_human_bot_lengths(df, human=None, bot=None): | |
import re | |
len_human_min = [] | |
len_human_max = [] | |
len_human_mean = [] | |
len_bot_min = [] | |
len_bot_max = [] | |
len_bot_mean = [] | |
human = human or '<human>:' | |
bot = bot or '<bot>:' | |
for is_human in [True, False]: | |
what = human if is_human else bot | |
other = human if not is_human else bot | |
for i in range(df.shape[0]): | |
text = df.loc[i, 'text'] | |
assert isinstance(text, str) | |
starts = [m.start() for m in re.finditer(what, text)] | |
if len(starts) == 1: | |
starts = [starts[0], len(text)] # always go into for loop below | |
assert len(text) | |
list_what = [] | |
for ii in range(len(starts) - 1): | |
interaction = text[starts[ii]: starts[ii + 1]] | |
if other in interaction: | |
interaction = interaction[:interaction.find(other)] | |
interaction.strip() | |
list_what.append(interaction) | |
if not list_what: | |
list_what = [''] # handle corrupted data, very rare, leads to sizes 0 | |
if is_human: | |
len_human_min.append(min([len(x) for x in list_what])) | |
len_human_max.append(max([len(x) for x in list_what])) | |
len_human_mean.append(np.mean([len(x) for x in list_what])) | |
else: | |
len_bot_min.append(min([len(x) for x in list_what])) | |
len_bot_max.append(max([len(x) for x in list_what])) | |
len_bot_mean.append(np.mean([len(x) for x in list_what])) | |
df['len_human_min'] = len_human_min | |
df['len_human_max'] = len_human_max | |
df['len_human_mean'] = len_human_mean | |
df['len_bot_min'] = len_bot_min | |
df['len_bot_max'] = len_bot_max | |
df['len_bot_mean'] = len_bot_mean | |
np.random.seed(1234) | |
pd.set_option('display.max_columns', None) | |
print("Before chopping") | |
print(df.describe()) | |
return df | |
def test_grade(): | |
df = None | |
file = "h2oGPT.cleaned.chopped.human_bot.shorter.parquet" | |
output_file = "h2oGPT.cleaned.graded1.human_bot.shorter.parquet" | |
if not os.path.exists(output_file): | |
if df is None: | |
df = pd.read_parquet(file).reset_index(drop=True) | |
df = add_textstat_grade(df) | |
min_grade = 10 | |
max_grade = 25 | |
df = df[df['flesch_grade'] >= min_grade] | |
df = df[df['flesch_grade'] <= max_grade] | |
print("After Flesch grade") | |
print(df.describe()) | |
df.to_parquet(output_file, index=False) | |
file = output_file | |
output_file = "h2oGPT.cleaned.graded2.human_bot.shorter.parquet" | |
if not os.path.exists(output_file): | |
# slower than alt-profanity, do last, but do before deberta grading, since that's slower | |
if df is None: | |
df = pd.read_parquet(file).reset_index(drop=True) | |
df = add_better_profanity_flag(df) | |
before_rows = df.shape[0] | |
df = df[df['better_profanity'] == 0] | |
df = df.drop(['better_profanity'], axis=1) | |
after_rows = df.shape[0] | |
print("Dropped %d rows out of %d due to better_profanity" % (before_rows - after_rows, before_rows)) | |
print(df.describe()) | |
df.to_parquet(output_file, index=False) | |
file = output_file | |
output_file = 'h2oGPT.cleaned.graded3.human_bot.shorter.parquet' | |
if not os.path.exists(output_file): | |
if df is None: | |
df = pd.read_parquet(file).reset_index(drop=True) | |
df = add_deberta_grade(df) | |
min_grade = 0.3 | |
max_grade = np.inf | |
before_rows = df.shape[0] | |
df = df[df['grade_deberta'] >= min_grade] | |
df = df[df['grade_deberta'] <= max_grade] | |
after_rows = df.shape[0] | |
print("Dropped %d rows out of %d due to deberta grade" % (before_rows - after_rows, before_rows)) | |
print("After DeBERTa grade") | |
print(df.describe()) | |
df.to_parquet(output_file, index=False) | |
file = output_file | |
output_file = 'h2oGPT.cleaned.graded.human_bot.shorter.parquet' | |
if df is None: | |
df = pd.read_parquet(file).reset_index(drop=True) | |
df.to_parquet(output_file, index=False) | |
def test_add_open_assistant(fixup_personality, only_personality, deberta_grading, prompt_type, save_json=True): | |
""" | |
Flatten tree structure into one row per path from root to leaf | |
Also turn into human_bot prompting format: | |
<human>: question\n<bot>: answer <human>: question2\n<bot>: answer2 Etc. | |
Also saves a .json locally as side-effect | |
returns list of dicts, containing intput, prompt_type and source | |
""" | |
from datasets import load_dataset | |
data_file = "OpenAssistant/oasst1" | |
ds = load_dataset(data_file) | |
df = pd.concat([ds['train'].to_pandas(), ds['validation'].to_pandas()], axis=0) | |
rows = {} | |
message_ids = df['message_id'].values.tolist() | |
message_tree_ids = df['message_tree_id'].values.tolist() | |
parent_ids = df['parent_id'].values.tolist() | |
texts = df['text'].values.tolist() | |
roles = df['role'].values.tolist() | |
deleteds = df['deleted'].values.tolist() | |
for i in range(df.shape[0]): | |
# collect all trees | |
message_id = message_ids[i] | |
message_tree_id = message_tree_ids[i] | |
parent_id = parent_ids[i] | |
text = texts[i] | |
deleted = deleteds[i] | |
if deleted: | |
continue | |
if fixup_personality: | |
text = text.replace("Open Assistant", "h2oGPT") | |
text = text.replace("Open-Assistant", "h2oGPT") | |
text = text.replace("open-assistant", "h2oGPT") | |
text = text.replace("OpenAssistant", "h2oGPT") | |
text = text.replace("open assistant", "h2oGPT") | |
text = text.replace("Open Assistand", "h2oGPT") | |
text = text.replace("Open Assitant", "h2oGPT") | |
text = text.replace("Open Assistent", "h2oGPT") | |
text = text.replace("Open Assisstant", "h2oGPT") | |
text = text.replace("Open Assitent", "h2oGPT") | |
text = text.replace("Open Assitiant", "h2oGPT") | |
text = text.replace("Open Assistiant", "h2oGPT") | |
text = text.replace("Open Assitan ", "h2oGPT ") | |
text = text.replace("Open Assistan ", "h2oGPT ") | |
text = text.replace("Open Asistant", "h2oGPT") | |
text = text.replace("Open Assiant", "h2oGPT") | |
text = text.replace("Assistant", "h2oGPT") | |
text = text.replace("LAION AI", "H2O.ai") | |
text = text.replace("LAION-AI", "H2O.ai") | |
text = text.replace("LAION,", "H2O.ai,") | |
text = text.replace("LAION.ai", "H2O.ai") | |
text = text.replace("LAION.", "H2O.ai.") | |
text = text.replace("LAION", "H2O.ai") | |
role = roles[i] | |
if prompt_type == "llama2": | |
new_data = ('[INST] ' if role == 'prompter' else ' [/INST] ') + text | |
if parent_id and role == 'prompter': | |
new_data = " " + new_data | |
elif prompt_type == "human_bot": | |
new_data = ('<human>: ' if role == 'prompter' else '<bot>: ') + text | |
else: | |
raise NotImplementedError("prompt_type not supported") | |
entry = dict(message_id=message_id, parent_id=parent_id, text=new_data) | |
if message_tree_id not in rows: | |
rows[message_tree_id] = [entry] | |
else: | |
rows[message_tree_id].append(entry) | |
all_rows = [] | |
for node_id in rows: | |
# order responses in tree, based on message/parent relationship | |
conversations = [] | |
list_msgs = rows[node_id] | |
# find start | |
while len(list_msgs): | |
for i, leaf in enumerate(list_msgs): | |
found = False | |
parent_id = leaf['parent_id'] | |
if parent_id is None: | |
# conversation starter | |
conversations.append(leaf) | |
found = True | |
else: | |
for conv in conversations: | |
# find all conversations to add my message to | |
if parent_id in conv['message_id'] and parent_id != conv['message_id'][-len(parent_id):]: | |
# my message doesn't follow conversation | |
continue | |
if parent_id == conv['message_id'][-len(parent_id):]: | |
# my message follows conversation, but fork first, so another follow-on message can do same | |
conversations.append(conv.copy()) | |
if prompt_type == "llama2": | |
conv['text'] += f"""{leaf['text']}""" | |
elif prompt_type == "human_bot": | |
conv['text'] += f""" | |
{leaf['text']} | |
""" | |
else: | |
raise NotImplementedError | |
conv['message_id'] += leaf['message_id'] | |
found = True | |
break | |
if found: | |
# my content was used, so nuke from list | |
del list_msgs[i] | |
break | |
# now reduce down to final conversations, find the longest chains of message ids | |
for i, conv in enumerate(conversations): | |
for j, conv2 in enumerate(conversations): | |
if i == j: | |
continue | |
if conv['message_id'] and conv2['message_id']: | |
assert conv['message_id'] != conv2['message_id'] | |
# delete the shorter conversation, if one contains the other | |
if conv['message_id'] in conv2['message_id']: | |
conv['message_id'] = None | |
if conv2['message_id'] in conv['message_id']: | |
conv2['message_id'] = None | |
conversations = [c for c in conversations if c['message_id']] | |
if only_personality: | |
if prompt_type == "human_bot": | |
all_rows.extend( | |
[dict(input=c['text'] + "\n<human>:", output="", prompt_type='plain', source=data_file) for c in conversations if | |
'h2oGPT' in c['text']]) | |
elif prompt_type == "llama2": | |
all_rows.extend( | |
[dict(input=c['text'] + | |
("" if c['text'].rfind("[/INST]") > c['text'].rfind("[INST]") else " [/INST]"), | |
output="", prompt_type='plain', source=data_file) for c in conversations if | |
'h2oGPT' in c['text']]) | |
else: | |
raise NotImplementedError | |
else: | |
if prompt_type == "human_bot": | |
all_rows.extend( | |
[dict(input=c['text'] + "\n<human>:", output="", prompt_type='plain', source=data_file) for c in conversations | |
if | |
"What is H2O.ai" not in c['text']]) | |
elif prompt_type == "llama2": | |
all_rows.extend( | |
[dict(input=c['text'] + | |
(" " if c['text'].rfind("[/INST]") > c['text'].rfind("[INST]") else " [/INST]"), | |
output="", prompt_type='plain', source=data_file) for c in conversations if | |
"What is H2O.ai" not in c['text']]) | |
else: | |
raise NotImplementedError | |
unhelpful = get_unhelpful_list() | |
all_rows = [x for x in all_rows if not any(u in x['input'] for u in unhelpful)] | |
personality = create_personality_data(prompt_type=prompt_type) | |
all_rows.extend(personality * 10) | |
np.random.seed(123) | |
np.random.shuffle(all_rows) | |
print(len(all_rows)) | |
if deberta_grading: | |
df = pd.DataFrame(all_rows) | |
df = df.rename(columns={'input': 'text'}) | |
df = add_deberta_grade(df) | |
df = df.rename(columns={'text': 'input'}) | |
drop = True | |
if drop: | |
min_grade = 0.3 | |
max_grade = np.inf | |
before_rows = df.shape[0] | |
df = df[df['grade_deberta'] >= min_grade] | |
df = df[df['grade_deberta'] <= max_grade] | |
after_rows = df.shape[0] | |
print("Dropped %d rows out of %d due to deberta grade" % (before_rows - after_rows, before_rows)) | |
print("After DeBERTa grade") | |
print(df.describe()) | |
all_rows = [] | |
for i in range(df.shape[0]): | |
all_rows.append( | |
dict( | |
input=df['input'].iloc[i], | |
output=df['output'].iloc[i], | |
source=df['source'].iloc[i], | |
prompt_type=df['prompt_type'].iloc[i], | |
grade_deberta=df['grade_deberta'].iloc[i], | |
) | |
) | |
if save_json: | |
data_file = data_file + \ | |
("_h2ogpt" if fixup_personality else "") + \ | |
("_only" if only_personality else "") + \ | |
("_graded" if deberta_grading else "") + \ | |
("_llama2_chat" if prompt_type == "llama2" else "") | |
for i in range(len(all_rows)): | |
all_rows[i]['id'] = i | |
with open(data_file.lower().replace("/", "_") + ".json", "w") as f: | |
f.write(json.dumps(all_rows, indent=2)) | |
return all_rows | |
def test_finalize_to_json(): | |
df = pd.read_parquet('h2oGPT.cleaned.graded.human_bot.shorter.parquet') | |
df = df.rename(columns={'text': 'input'}) | |
print("Number of high-quality human_bot interactions: %s" % df.shape[0], flush=True) | |
print("Adding open assistant data") | |
with open("openassistant_oasst1_h2ogpt_graded.json") as f: | |
open_assistant = json.loads(f.read()) | |
df = pd.concat([df, pd.DataFrame(open_assistant)], axis=0) | |
def final_clean(df): | |
from better_profanity import profanity | |
profanity.load_censor_words_from_file("data/censor_words.txt") | |
df['profanity'] = parallel_apply( | |
df['input'], | |
lambda x: profanity.contains_profanity(x), | |
n_jobs=-1, | |
) | |
return df[(df['profanity'] == 0)].reset_index(drop=True) | |
print("Before cleaning: Number of final high-quality human_bot interactions: %s" % df.shape[0], flush=True) | |
df = final_clean(df) | |
print("After cleaning: Number of final high-quality human_bot interactions: %s" % df.shape[0], flush=True) | |
print(df.describe()) | |
print(df.shape) | |
row_list = [] | |
for i in range(df.shape[0]): | |
row_list.append( | |
dict( | |
input=df.loc[i, 'input'], | |
source=df.loc[i, 'source'], | |
prompt_type='plain', | |
) | |
) | |
np.random.seed(1234) | |
np.random.shuffle(row_list) | |
unhelpful = get_unhelpful_list() | |
row_list = [x for x in row_list if not any(u in x['input'] for u in unhelpful)] | |
for i in range(len(row_list)): | |
row_list[i]['id'] = i | |
row_list[i]['input'] = row_list[i]['input'].replace(" <bot>:", "\n<bot>:") | |
with open('h2ogpt-oig-oasst1-instruct-cleaned-v3.json', "w") as f: | |
f.write(json.dumps(row_list, indent=2)) | |
def create_personality_data(prompt_type="llama2"): | |
questions = [ | |
"What's your name?", | |
"What is your name?", | |
"What are you?", | |
"Who are you?", | |
"Do you have a name?", | |
"Who trained you?", | |
"Who created you?", | |
"Who made you?", | |
] | |
answers = [ | |
"I'm h2oGPT, a large language model by H2O.ai.", | |
"I'm h2oGPT, a large language model by H2O.ai, the visionary leader in democratizing AI.", | |
"My name is h2oGPT. I'm a large language model by H2O.ai, the visionary leader in democratizing AI.", | |
"My name is h2oGPT. I'm a large language model trained by H2O.ai.", | |
"Hi! I'm h2oGPT, a large language model by H2O.ai.", | |
"Hi! I'm h2oGPT, a large language model by H2O.ai, the visionary leader in democratizing AI.", | |
] | |
help = [ | |
"", | |
" How can I help you?", | |
" How may I assist you?", | |
" Nice to meet you.", | |
] | |
import itertools | |
rows = [] | |
for pair in itertools.product(questions, answers, help): | |
rows.append( | |
dict(input=f"{pair[0]}", output=f"{pair[1]}{pair[2]}", prompt_type=prompt_type, source="H2O.ai") | |
) | |
for q, a in [ | |
("What is H2O.ai?", "H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models."), | |
("What is h2o.ai?", "H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models."), | |
("What is H2O?", "H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models."), | |
("Who is h2o.ai?", "H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models."), | |
("who is h2o.ai?", "H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models."), | |
("who is h2o?", "H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models."), | |
("what is H2O.ai?", "H2O.ai is the visionary leader in democratizing AI."), | |
("who is H2O.ai?", "H2O.ai is the visionary leader in democratizing AI."), | |
("who is H2O?", "H2O.ai is the visionary leader in democratizing AI."), | |
("Who is h20?", "H2O.ai is the visionary leader in democratizing AI."), | |
]: | |
rows.append(dict(input=q, output=a, prompt_type=prompt_type, source='H2O.ai')) | |
print(len(rows)) | |
with open("h2ogpt-personality.json", "w") as f: | |
f.write(json.dumps(rows, indent=2)) | |
return rows | |
def test_check_stats_data(): | |
filename = 'h2ogpt-oig-oasst1-instruct-cleaned-v3.json' | |
df = pd.read_json(filename) | |
# get word stats | |
df['char_count'] = df['input'].apply(lambda x: len(x)) | |
import matplotlib.pyplot as plt | |
plt.figure(figsize=(10, 10)) | |
plt.hist(df['char_count'], bins=100) | |
chars_avg = np.mean(df['char_count']) | |
chars_median = np.median(df['char_count']) | |
plt.title("char_count avg: %s median: %s" % (chars_avg, chars_median)) | |
plt.savefig('chars_hist.png') | |
plt.close() | |
# get tokenize stats for random sample of 1000 rows | |
from finetune import generate_and_tokenize_prompt | |
from loaders import get_loaders, get_tokenizer | |
from functools import partial | |
llama_type = False | |
tokenizer_base_model = base_model = 'h2oai/h2ogpt-oasst1-512-20b' | |
model_loader, tokenizer_loader, conditional_type = ( | |
get_loaders(model_name=base_model, reward_type=False, llama_type=llama_type)) | |
local_files_only = False | |
resume_download = True | |
use_auth_token = False | |
tokenizer = get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token) | |
prompt_type = 'plain' # trained with data already in human bot form | |
train_on_inputs = True | |
add_eos_token = False | |
cutoff_len = 512 # can choose 2048 | |
generate_and_tokenize_prompt_fun = partial(generate_and_tokenize_prompt, prompt_type=prompt_type, | |
train_on_inputs=train_on_inputs, add_eos_token=add_eos_token, | |
cutoff_len=cutoff_len, tokenizer=tokenizer) | |
from datasets import load_dataset | |
data = load_dataset("json", data_files={"train": filename}) | |
val_set_size = 0.90 | |
train_val = data["train"].train_test_split( | |
test_size=val_set_size, shuffle=True, seed=42 | |
) | |
train_data = train_val["train"] | |
train_data = train_data.shuffle().map(generate_and_tokenize_prompt_fun, num_proc=os.cpu_count()) | |
df_tokens = pd.DataFrame([len(x) for x in train_data['input_ids']], columns=['token_count']) | |
plt.figure(figsize=(10, 10)) | |
plt.hist(df_tokens['token_count'], bins=100) | |
token_avg = np.mean(df_tokens['token_count']) | |
token_median = np.median(df_tokens['token_count']) | |
plt.title("token_count with cutoff=%s avg: %s median: %s" % (cutoff_len, token_avg, token_median)) | |
plt.savefig('token_hist_%s.png' % cutoff_len) | |
plt.close() | |
def get_unhelpful_list(): | |
# base versions | |
unhelpful = ["I'm sorry, I didn't quite understand your question, could you please rephrase it?", | |
"I'm sorry, but I don't understand your question. Could you please rephrase it?", | |
"I'm sorry, I don't quite understand your question", | |
"I'm sorry, I don't know", | |
"I'm sorry, but I don't know", | |
"I don't know anything", | |
"I do not know", | |
"I don't know", | |
"I don't know how", | |
"I do not know how", | |
"Can you please explain what you mean", | |
"please explain what you mean", | |
"please explain", | |
"I'm sorry, but I don't know how to tell a story. Can you please explain what you mean by", | |
"I'm sorry but I don't understand what you mean", | |
"I don't understand", | |
"I don't have the ability", | |
"I do not have the ability", | |
"I do not have", | |
"I am a language model,", | |
"I am a large language model,", | |
"I do not understand your question. Can you please try to make it clearer?", | |
"I'm sorry, but as an AI language model", | |
"I apologize, but I cannot rephrase text that I cannot understand. Your post is difficult to read and follow.", | |
"I apologize, but I am not h2oGPT. I am a language model developed by H2O.ai. How may I help you?", | |
"Sorry, but I am not an actual Linux shell, nor am I capable of emulating one. I am an open source chat assistant and would be glad t", | |
"I apologize, but I cannot perform the task you have requested.", | |
"I'm sorry, I cannot perform this task as I am an AI language model and do not have access", | |
"I'm sorry, I'm not sure what you're asking for here.", | |
"I'm not sure what you are asking", | |
"You need to provide more context", | |
] | |
# reduced versions, with redundant parts, just to give context for where they came from | |
unhelpful += ["sorry, I didn't quite understand your question", | |
"I didn't quite understand your question", | |
"I didn't understand your question", | |
"I did not understand your question", | |
"I did not understand the question", | |
"could you please rephrase" | |
"could you rephrase" | |
"I do not understand your question.", | |
"I do not understand the question.", | |
"I do not understand that question.", | |
"Can you please try to make it clearer", | |
"Can you try to make it clearer", | |
"sorry, but as an AI language model", | |
"as an AI language model", | |
"I apologize, but I cannot", | |
"I cannot rephrase text", | |
"I cannot understand. Your post is difficult to read and follow." | |
"Your post is difficult to read and follow." | |
"I apologize, but I am", | |
"Sorry, but I am not ", | |
"nor am I capable", | |
"I am not capable of", | |
"I apologize, but I cannot perform the task you have requested", | |
"I cannot perform the task", | |
"I cannot complete the task", | |
"I'm sorry", | |
"I am sorry", | |
"do not have access", | |
"not sure what you're asking for", | |
"not sure what you are asking for", | |
"not sure what is being asked", | |
"I'm not sure what you are asking", | |
"not sure what you are asking", | |
"You need to provide more context", | |
"provide more context", | |
] | |
unhelpful += ["As a large language model", | |
"cannot provide any information", | |
"As an artificial intelligence I do not have the capability", | |
"As an artificial intelligence I don't have the capability", | |
"As an artificial intelligence I can't", | |
"As an artificial intelligence I cannot", | |
"I am sorry but I do not understand", | |
"Can you please explain", | |
"(sorry couldn't resist)", | |
"(sorry could not resist)", | |
" :)", | |
" ;)", | |
" :-)", | |
" ;-)", | |
" lol ", | |
"Thanks so much!!!", | |
"Thank You :)!!!", | |
"Please try not to repeat", | |
"I am an AI language model", | |
"I'm a AI assistant that", | |
"I'm an AI assistant that", | |
"I am an AI assistant that", | |
"etc.", | |
"etc.etc.", | |
"etc. etc.", | |
"etc etc", | |
] | |
return unhelpful | |
def test_check_unhelpful(): | |
# file = '/home/jon/Downloads/openassistant_oasst1_h2ogpt_graded.json' | |
file = '/home/jon/Downloads/openassistant_oasst1_h2ogpt_grades.json' | |
# file = 'h2ogpt-oig-oasst1-instruct-cleaned-v2.json' | |
unhelpful = get_unhelpful_list() | |
# data = json.load(open(file, 'rt')) | |
df = pd.read_json(file) | |
use_reward_score_threshold = False | |
use_bleu_threshold = False | |
use_sentence_sim = True | |
from sacrebleu.metrics import BLEU | |
bleu = BLEU() | |
from nltk.translate.bleu_score import sentence_bleu | |
def get_bleu(actual, expected_list): | |
# return bleu.sentence_score(actual, expected_list).score | |
return sentence_bleu(expected_list, actual) | |
threshold = 0.0 | |
if use_reward_score_threshold: | |
df = df[df['grade_deberta'] > threshold] | |
# back to as if original json load | |
data = df.to_dict(orient='records') | |
bads = {} | |
string_all = str(data) | |
for sub in unhelpful: | |
bads[sub] = string_all.count(sub) | |
bads = {k: v for k, v in bads.items() if v > 0} | |
import pprint | |
pp = pprint.PrettyPrinter(indent=4) | |
pp.pprint(bads) | |
total_bads = sum(list(bads.values())) | |
print('total_bads: %s' % total_bads, flush=True) | |
# check just bot | |
import re | |
convs = [[x.strip() for x in re.split(r'%s|%s' % (human, bot), y['input']) if x.strip()] for y in data] | |
humans = [[x for i, x in enumerate(y) if i % 2 == 0] for y in convs] | |
bots = [[x for i, x in enumerate(y) if i % 2 == 1] for y in convs] | |
# FIXME: apply back to json etc., just see for now | |
bleu_threshold = 0.9 | |
if use_bleu_threshold: | |
bots = [[x for x in y if get_bleu(x, unhelpful) < bleu_threshold] for y in tqdm(bots)] | |
cosine_sim_threshold = 0.8 | |
if use_sentence_sim: | |
# pip install sentence_transformers-2.2.2 | |
from sentence_transformers import SentenceTransformer | |
# sent_model = 'bert-base-nli-mean-tokens' | |
# sent_model = 'nli-distilroberta-base-v2' | |
sent_model = 'all-MiniLM-L6-v2' | |
model = SentenceTransformer(sent_model) | |
sentence_embeddings = model.encode(unhelpful) | |
from sklearn.metrics.pairwise import cosine_similarity | |
bots = [x for x in tqdm(bots) if | |
np.max(cosine_similarity(model.encode(x), sentence_embeddings)) < cosine_sim_threshold] | |
bads_bots = {} | |
string_all = str(bots) | |
for sub in unhelpful: | |
bads_bots[sub] = string_all.count(sub) | |
bads_bots = {k: v for k, v in bads_bots.items() if v > 0} | |
import pprint | |
pp = pprint.PrettyPrinter(indent=4) | |
pp.pprint(bads_bots) | |
total_bads_bots = sum(list(bads_bots.values())) | |
print('threshold: %g use_bleu_threshold: %g total_bads_bots: %s total_bots: %s total_humans: %s' % ( | |
threshold, use_bleu_threshold, total_bads_bots, len(bots), len(humans)), flush=True) | |
# assert len(bads) == 0, bads | |
assert len(bads_bots) == 0, bads_bots | |
def test_fortune2000_personalized(): | |
row_list = [] | |
import glob | |
if not os.path.isdir("wikitext"): | |
raise RuntimeError("download https://github.com/h2oai/h2ogpt/files/11423008/wikitext.zip and unzip") | |
for file in glob.glob("wikitext/*.txt"): | |
with open(file, "r") as f: | |
blob = f.read() | |
N = 512 * 4 | |
row_list.extend([{'input': s, 'prompt_type': 'plain', 'source': "%s" % os.path.basename(file)} | |
for s in get_sentences(blob, N) if s]) | |
personality = create_personality_data() | |
import copy | |
for i in range(10): | |
row_list.extend(copy.deepcopy(personality)) | |
np.random.seed(123) | |
np.random.shuffle(row_list) | |
for i in range(len(row_list)): | |
row_list[i]['id'] = i | |
for i in range(len(row_list)): | |
assert row_list[i]['id'] == i | |
with open("h2ogpt-fortune2000-personalized.json", "w") as ff: | |
ff.write(json.dumps(row_list, indent=2)) | |