diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index 615bf067f9675c7c925c97a3b9d002bca62ef54f..0000000000000000000000000000000000000000 --- a/.gitattributes +++ /dev/null @@ -1,87 +0,0 @@ -data/datasets/lilac/pile-of-law-constitutions/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/pile-of-law-constitutions/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/pile-of-law-constitutions/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/pile-of-law-constitutions/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/piqa/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/piqa/goal/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/piqa/goal/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/piqa/goal/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/piqa/sol1/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/piqa/sol1/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/piqa/sol1/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/piqa/sol2/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/piqa/sol2/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/pile-of-law-atticus-contracts/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/mmlu_professional_law/choices/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/mmlu_professional_law/choices/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/mmlu_professional_law/choices/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/mmlu_professional_law/choices/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/mmlu_professional_law/choices/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/mmlu_professional_law/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/mmlu_professional_law/question/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/mmlu_professional_law/question/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/mmlu_professional_law/question/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/mmlu_professional_law/question/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/mmlu_professional_law/question/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/mmlu_professional_law/question/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/pile-of-law-r-legaladvice/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/pile-of-law-r-legaladvice/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/pile-of-law-r-legaladvice/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/pile-of-law-r-legaladvice/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/pile-of-law-r-legaladvice/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/pile-of-law-r-legaladvice/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/open-asssistant-conversations/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/open-asssistant-conversations/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/open-asssistant-conversations/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/open-asssistant-conversations/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/answers/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/answers/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/answers/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/answers/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/answers/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/context/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/context/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/context/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/context/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/context/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/context/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/context/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/context/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/question/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/question/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/question/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/question/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/squad_v2/question/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/imdb/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/imdb/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/imdb/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/imdb/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/imdb/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/imdb/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/imdb/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/wikitext-2-raw-v1/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/wikitext-2-raw-v1/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/wikitext-2-raw-v1/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/wikitext-2-raw-v1/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/wikitext-2-raw-v1/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text -data/datasets/lilac/wikitext-2-raw-v1/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text diff --git a/data/datasets/lilac/imdb/config.yml b/data/datasets/lilac/imdb/config.yml deleted file mode 100644 index 73e9df5d09dd8107a938e6101096b0f846e328b0..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/imdb/config.yml +++ /dev/null @@ -1,31 +0,0 @@ -embeddings: -- embedding: gte-small - path: text -name: imdb -namespace: local -settings: - preferred_embedding: gte-small - ui: - media_paths: - - text -signals: -- path: text - signal: - signal_name: near_dup -- path: text - signal: - signal_name: text_statistics -- path: text - signal: - signal_name: lang_detection -- path: text - signal: - signal_name: spacy_ner -- path: text - signal: - signal_name: pii -source: - dataset_name: imdb - source_name: huggingface -tags: -- machine-learning diff --git a/data/datasets/lilac/imdb/data-00000-of-00001.parquet b/data/datasets/lilac/imdb/data-00000-of-00001.parquet deleted file mode 100644 index 7de63f25ce8a56ffcdd5d13481936d94b0bf1114..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/imdb/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6fe90be23f86ca1e73b8a77a235344db822601f794a5643dca9d0d07c49ce3d8 -size 86160450 diff --git a/data/datasets/lilac/imdb/manifest.json b/data/datasets/lilac/imdb/manifest.json deleted file mode 100644 index c31e03ff0e29009dadc31f58a3651126532ba9a2..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/imdb/manifest.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "data_schema": { - "fields": { - "text": { - "dtype": "string" - }, - "label": { - "dtype": "string" - }, - "__hfsplit__": { - "dtype": "string" - }, - "__rowid__": { - "dtype": "string" - } - } - } -} \ No newline at end of file diff --git a/data/datasets/lilac/imdb/text/gte-small/hnsw.hnswlib.bin b/data/datasets/lilac/imdb/text/gte-small/hnsw.hnswlib.bin deleted file mode 100644 index 7c55dfabe007419c6d7b9bf8b5f2802db050d614..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/imdb/text/gte-small/hnsw.hnswlib.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7b8fc440b947f068966bc68c83ce4b5502b98e3aab928d58c295dc8a7b7b016c -size 691432396 diff --git a/data/datasets/lilac/imdb/text/gte-small/hnsw.lookup.pkl b/data/datasets/lilac/imdb/text/gte-small/hnsw.lookup.pkl deleted file mode 100644 index 53b11645b72ae8b2ac0a7d4f8d3911ab8c38d54f..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/imdb/text/gte-small/hnsw.lookup.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d3ee2426c761b50025e02f2388fa65c51eb9705d6fd65f95d6502650214b8472 -size 10390867 diff --git a/data/datasets/lilac/imdb/text/gte-small/signal_manifest.json b/data/datasets/lilac/imdb/text/gte-small/signal_manifest.json deleted file mode 100644 index 6c4df1308025f01afd7ad11c2e4eab36963c61b0..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/imdb/text/gte-small/signal_manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "files": [], - "parquet_id": "gte-small(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "gte-small": { - "repeated_field": { - "fields": { - "embedding": { - "dtype": "embedding" - } - }, - "dtype": "string_span" - }, - "signal": { - "signal_name": "gte-small" - } - } - } - } - } - }, - "signal": { - "signal_name": "gte-small" - }, - "enriched_path": [ - "text" - ], - "vector_store": "hnsw" -} \ No newline at end of file diff --git a/data/datasets/lilac/imdb/text/gte-small/spans.pkl b/data/datasets/lilac/imdb/text/gte-small/spans.pkl deleted file mode 100644 index 3e11946858f1457913e6325909c7ef4c4627aadd..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/imdb/text/gte-small/spans.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:16417449057fc4304e098e037cb2b3e9a693570768a68bdca457d452adaee130 -size 7476546 diff --git a/data/datasets/lilac/imdb/text/lang_detection/data-00000-of-00001.parquet b/data/datasets/lilac/imdb/text/lang_detection/data-00000-of-00001.parquet deleted file mode 100644 index 74adfd2807c8751cdadb71fcb56559359e0e17ac..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/imdb/text/lang_detection/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:adba6df16ec47b68625618b01a0b8d2cc65de572acb20eed561128b037fcdfd7 -size 3309315 diff --git a/data/datasets/lilac/imdb/text/lang_detection/signal_manifest.json b/data/datasets/lilac/imdb/text/lang_detection/signal_manifest.json deleted file mode 100644 index 2163e9fd9535885346193cbb0e9772d01841cd31..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/imdb/text/lang_detection/signal_manifest.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "lang_detection(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "lang_detection": { - "dtype": "string", - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - } - } - } - } - } - }, - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/imdb/text/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/imdb/text/near_dup/data-00000-of-00001.parquet deleted file mode 100644 index 35392cacc82b31fc8f031326450153f1bbd0c0f9..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/imdb/text/near_dup/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0c75a5f0d12b8b02671e99a2dd313d292610c9264f2302ad1877959430420079 -size 3915752 diff --git a/data/datasets/lilac/imdb/text/near_dup/signal_manifest.json b/data/datasets/lilac/imdb/text/near_dup/signal_manifest.json deleted file mode 100644 index 6febd3e69b9125b0be3b4778f7b65fab8a5d325b..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/imdb/text/near_dup/signal_manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "near_dup(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "near_dup": { - "fields": { - "cluster_id": { - "dtype": "uint32", - "categorical": true - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - } - } - } - } - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet b/data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet deleted file mode 100644 index 1c7fdb96547b73a53ccbc529c6fbcedd09998938..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2dfe71845bfd419a2a309b132456341bf317d63e8ccf8bc100835f1a20c81c5b -size 3313701 diff --git a/data/datasets/lilac/imdb/text/pii/signal_manifest.json b/data/datasets/lilac/imdb/text/pii/signal_manifest.json deleted file mode 100644 index ecb2e8b26fb5328004c7cd7b5363b79e88acf8ac..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/imdb/text/pii/signal_manifest.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "pii(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "pii": { - "fields": { - "emails": { - "repeated_field": { - "dtype": "string_span" - } - }, - "ip_addresses": { - "repeated_field": { - "dtype": "string_span" - } - }, - "secrets": { - "repeated_field": { - "dtype": "string_span" - } - } - }, - "signal": { - "signal_name": "pii" - } - } - } - } - } - }, - "signal": { - "signal_name": "pii" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/imdb/text/spacy_ner/data-00000-of-00001.parquet b/data/datasets/lilac/imdb/text/spacy_ner/data-00000-of-00001.parquet deleted file mode 100644 index 7d35f24b7a959412cd9a0c2b33774b9ed37b80c2..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/imdb/text/spacy_ner/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:be39eb80c981da64a2ec5f7fe42c43ffb390b887a220e278a58cdde04e7824ef -size 8479478 diff --git a/data/datasets/lilac/imdb/text/spacy_ner/signal_manifest.json b/data/datasets/lilac/imdb/text/spacy_ner/signal_manifest.json deleted file mode 100644 index 56c8c0883131d3cf35156969931918e4ea2da3a6..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/imdb/text/spacy_ner/signal_manifest.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "spacy_ner(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "spacy_ner": { - "repeated_field": { - "fields": { - "label": { - "dtype": "string" - } - }, - "dtype": "string_span" - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - } - } - } - } - } - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet b/data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet deleted file mode 100644 index 945b09c374c6967dbb6bc9074a3d1c271724f334..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4919817d0c520fc5aacc9721f01a0e29fb4d2c89b6698865c9680c81c44ce920 -size 4403809 diff --git a/data/datasets/lilac/imdb/text/text_statistics/signal_manifest.json b/data/datasets/lilac/imdb/text/text_statistics/signal_manifest.json deleted file mode 100644 index 642e3171e68afe580d2043a5f4a9f1331fc85208..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/imdb/text/text_statistics/signal_manifest.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "text_statistics(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "text_statistics": { - "fields": { - "num_characters": { - "dtype": "int32" - }, - "readability": { - "dtype": "float32" - }, - "log(type_token_ratio)": { - "dtype": "float32" - }, - "frac_non_ascii": { - "dtype": "float32", - "bins": [ - [ - "Low", - null, - 0.15 - ], - [ - "Medium", - 0.15, - 0.3 - ], - [ - "High", - 0.3, - null - ] - ] - } - }, - "signal": { - "signal_name": "text_statistics" - } - } - } - } - } - }, - "signal": { - "signal_name": "text_statistics" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.hnswlib.bin b/data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.hnswlib.bin deleted file mode 100644 index bb52449d84416dc773f1df977ecc9babdee5ad6b..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.hnswlib.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:df9d6e2f5df4b8693544f31ca78a9d1936a4caf47acc2babeb1cb766131b7636 -size 684360968 diff --git a/data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.lookup.pkl b/data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.lookup.pkl deleted file mode 100644 index 550767a8331b0368aa657fb61e7cac286b3a225e..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.lookup.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2081ce5d760026fe341e0553cd9e40747ca902e4e7edb851cb747f350f19bb0d -size 11174465 diff --git a/data/datasets/lilac/mmlu_professional_law/choices/gte-small/signal_manifest.json b/data/datasets/lilac/mmlu_professional_law/choices/gte-small/signal_manifest.json deleted file mode 100644 index 6c04490217303a73516da9928f8e598115a3cb37..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/choices/gte-small/signal_manifest.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "files": [], - "parquet_id": "gte-small(choices)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "choices": { - "repeated_field": { - "fields": { - "gte-small": { - "repeated_field": { - "fields": { - "embedding": { - "dtype": "embedding" - } - }, - "dtype": "string_span" - }, - "signal": { - "signal_name": "gte-small" - } - } - } - } - } - } - }, - "signal": { - "signal_name": "gte-small" - }, - "enriched_path": [ - "choices", - "*" - ], - "vector_store": "hnsw" -} \ No newline at end of file diff --git a/data/datasets/lilac/mmlu_professional_law/choices/gte-small/spans.pkl b/data/datasets/lilac/mmlu_professional_law/choices/gte-small/spans.pkl deleted file mode 100644 index a8a3c3a2e60b5a6b6ee315f06cf24a8ec15c4588..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/choices/gte-small/spans.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:02fb1662da21f33ea1429a0f9adf1301185da46f642a722717fe7c523314fa57 -size 11173475 diff --git a/data/datasets/lilac/mmlu_professional_law/choices/lang_detection/data-00000-of-00001.parquet b/data/datasets/lilac/mmlu_professional_law/choices/lang_detection/data-00000-of-00001.parquet deleted file mode 100644 index f784fc68538c4d3a5546066cf36efb2a2874125f..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/choices/lang_detection/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:968d4f87c7b51b995d9e3a96423a06b91984e5ee4a47062cd53fe87cca5cafbe -size 3469413 diff --git a/data/datasets/lilac/mmlu_professional_law/choices/lang_detection/signal_manifest.json b/data/datasets/lilac/mmlu_professional_law/choices/lang_detection/signal_manifest.json deleted file mode 100644 index 132c9444fdce498203606a96b8b38e3d97cbac7d..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/choices/lang_detection/signal_manifest.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "lang_detection(choices)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "choices": { - "repeated_field": { - "fields": { - "lang_detection": { - "dtype": "string", - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - } - } - } - } - } - } - }, - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - }, - "enriched_path": [ - "choices", - "*" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/mmlu_professional_law/choices/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/mmlu_professional_law/choices/near_dup/data-00000-of-00001.parquet deleted file mode 100644 index d22aa7cf65a7a6f3dc4a5c2e7e05db5bdde6f6fe..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/choices/near_dup/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:316f3be499fcbb960bc1e83a201838ca0b3047a71d8e1c302fe4e0d833a3bf90 -size 5544176 diff --git a/data/datasets/lilac/mmlu_professional_law/choices/near_dup/signal_manifest.json b/data/datasets/lilac/mmlu_professional_law/choices/near_dup/signal_manifest.json deleted file mode 100644 index f39b2f7882b80a1a9b55fe5f6d6e6ded78c7013c..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/choices/near_dup/signal_manifest.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "near_dup(choices)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "choices": { - "repeated_field": { - "fields": { - "near_dup": { - "fields": { - "cluster_id": { - "dtype": "uint32", - "categorical": true - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - } - } - } - } - } - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - }, - "enriched_path": [ - "choices", - "*" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/mmlu_professional_law/choices/pii/data-00000-of-00001.parquet b/data/datasets/lilac/mmlu_professional_law/choices/pii/data-00000-of-00001.parquet deleted file mode 100644 index db00f582f70626ec2204613e14872306f5272f5d..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/choices/pii/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7cb41d4e9d0d82bd824abfa733d5be3a599e011098c5d41ebadeb1166a15f722 -size 3393096 diff --git a/data/datasets/lilac/mmlu_professional_law/choices/pii/signal_manifest.json b/data/datasets/lilac/mmlu_professional_law/choices/pii/signal_manifest.json deleted file mode 100644 index 901a0d06bbe458a729ec40a278e59267bb54a949..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/choices/pii/signal_manifest.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "pii(choices)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "choices": { - "repeated_field": { - "fields": { - "pii": { - "fields": { - "emails": { - "repeated_field": { - "dtype": "string_span" - } - }, - "ip_addresses": { - "repeated_field": { - "dtype": "string_span" - } - }, - "secrets": { - "repeated_field": { - "dtype": "string_span" - } - } - }, - "signal": { - "signal_name": "pii" - } - } - } - } - } - } - }, - "signal": { - "signal_name": "pii" - }, - "enriched_path": [ - "choices", - "*" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/data-00000-of-00001.parquet b/data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/data-00000-of-00001.parquet deleted file mode 100644 index 402e4fc9557e6641a7beb3af3454479894659e6b..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4b1255490f17c64f88b8b332c7df30060df612b9de11b17aaf6f70234c363e1e -size 4080744 diff --git a/data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/signal_manifest.json b/data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/signal_manifest.json deleted file mode 100644 index 974489ffd90d2ef30689acdf5c699671593e1c26..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/signal_manifest.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "spacy_ner(choices)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "choices": { - "repeated_field": { - "fields": { - "spacy_ner": { - "repeated_field": { - "fields": { - "label": { - "dtype": "string" - } - }, - "dtype": "string_span" - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - } - } - } - } - } - } - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - }, - "enriched_path": [ - "choices", - "*" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/mmlu_professional_law/choices/text_statistics/data-00000-of-00001.parquet b/data/datasets/lilac/mmlu_professional_law/choices/text_statistics/data-00000-of-00001.parquet deleted file mode 100644 index 995c8e0124c6daaf85c0e3b2d3fc59ce01f989ed..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/choices/text_statistics/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bc00a68e0f835e25b214d90e7e48251b39d748f1e836af713440cd0ea2517ead -size 4634821 diff --git a/data/datasets/lilac/mmlu_professional_law/choices/text_statistics/signal_manifest.json b/data/datasets/lilac/mmlu_professional_law/choices/text_statistics/signal_manifest.json deleted file mode 100644 index 710776d4bdd3983e70fa8797d558337eb07de097..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/choices/text_statistics/signal_manifest.json +++ /dev/null @@ -1,62 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "text_statistics(choices)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "choices": { - "repeated_field": { - "fields": { - "text_statistics": { - "fields": { - "num_characters": { - "dtype": "int32" - }, - "readability": { - "dtype": "float32" - }, - "log(type_token_ratio)": { - "dtype": "float32" - }, - "frac_non_ascii": { - "dtype": "float32", - "bins": [ - [ - "Low", - null, - 0.15 - ], - [ - "Medium", - 0.15, - 0.3 - ], - [ - "High", - 0.3, - null - ] - ] - } - }, - "signal": { - "signal_name": "text_statistics" - } - } - } - } - } - } - }, - "signal": { - "signal_name": "text_statistics" - }, - "enriched_path": [ - "choices", - "*" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/mmlu_professional_law/config.yml b/data/datasets/lilac/mmlu_professional_law/config.yml deleted file mode 100644 index cc20ff4f60cf5075a5410dae0e1eb5db2d922053..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/config.yml +++ /dev/null @@ -1,63 +0,0 @@ -embeddings: -- embedding: gte-small - path: - - choices - - '*' -- embedding: gte-small - path: question -name: mmlu_professional_law -namespace: local -settings: - preferred_embedding: gte-small - ui: - media_paths: - - question - - - choices - - '*' -signals: -- path: question - signal: - signal_name: text_statistics -- path: question - signal: - signal_name: pii -- path: question - signal: - signal_name: near_dup -- path: - - choices - - '*' - signal: - signal_name: text_statistics -- path: - - choices - - '*' - signal: - signal_name: spacy_ner -- path: question - signal: - signal_name: lang_detection -- path: - - choices - - '*' - signal: - signal_name: near_dup -- path: - - choices - - '*' - signal: - signal_name: pii -- path: - - choices - - '*' - signal: - signal_name: lang_detection -- path: question - signal: - signal_name: spacy_ner -source: - config_name: professional_law - dataset_name: cais/mmlu - source_name: huggingface -tags: -- legal diff --git a/data/datasets/lilac/mmlu_professional_law/data-00000-of-00001.parquet b/data/datasets/lilac/mmlu_professional_law/data-00000-of-00001.parquet deleted file mode 100644 index 72c4ca5a5b0858d72e2a5d89b3926426c70243be..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:65cd2771cf0bb88dbed9ad66ceaff472115f07c9dfea866c7e3f65b68392e745 -size 50699938 diff --git a/data/datasets/lilac/mmlu_professional_law/manifest.json b/data/datasets/lilac/mmlu_professional_law/manifest.json deleted file mode 100644 index 2a8c653bebcf792e4fb958593f3b2fd8e5e38492..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/manifest.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "data_schema": { - "fields": { - "question": { - "dtype": "string" - }, - "choices": { - "repeated_field": { - "dtype": "string" - } - }, - "answer": { - "dtype": "string" - }, - "__hfsplit__": { - "dtype": "string" - }, - "__rowid__": { - "dtype": "string" - } - } - } -} \ No newline at end of file diff --git a/data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.hnswlib.bin b/data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.hnswlib.bin deleted file mode 100644 index 4569a02af01da1f1634eeb504a906bb583dcdb17..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.hnswlib.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3b02300405fccc3011294e15ee869933dd81578173435defbcb19e3b40a65e93 -size 771802212 diff --git a/data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.lookup.pkl b/data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.lookup.pkl deleted file mode 100644 index e16a5c06f3590404cf63126390e80ac14f0aaab1..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.lookup.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f72169740d80ee2b2ea66589d7ebcc58c83381978a4640a27510c416a02bf6c7 -size 11296648 diff --git a/data/datasets/lilac/mmlu_professional_law/question/gte-small/signal_manifest.json b/data/datasets/lilac/mmlu_professional_law/question/gte-small/signal_manifest.json deleted file mode 100644 index 70228c5da7472862dc18b8642dc253faba48c558..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/question/gte-small/signal_manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "files": [], - "parquet_id": "gte-small(question)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "question": { - "fields": { - "gte-small": { - "repeated_field": { - "fields": { - "embedding": { - "dtype": "embedding" - } - }, - "dtype": "string_span" - }, - "signal": { - "signal_name": "gte-small" - } - } - } - } - } - }, - "signal": { - "signal_name": "gte-small" - }, - "enriched_path": [ - "question" - ], - "vector_store": "hnsw" -} \ No newline at end of file diff --git a/data/datasets/lilac/mmlu_professional_law/question/gte-small/spans.pkl b/data/datasets/lilac/mmlu_professional_law/question/gte-small/spans.pkl deleted file mode 100644 index 2c191ec72cf74fe65597763ee0fd9267c75677cd..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/question/gte-small/spans.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9b51cad455e94b167bc9cf130c262ed1b143a8f386c7074a61983e01cd93d277 -size 7911602 diff --git a/data/datasets/lilac/mmlu_professional_law/question/lang_detection/data-00000-of-00001.parquet b/data/datasets/lilac/mmlu_professional_law/question/lang_detection/data-00000-of-00001.parquet deleted file mode 100644 index 23b1a6a53511c3187c805eee93017682a06d221b..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/question/lang_detection/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf6cf8cdc246ce4406599aec8782d3be02f2585f1fbad74173faf0ffcb453a49 -size 3361922 diff --git a/data/datasets/lilac/mmlu_professional_law/question/lang_detection/signal_manifest.json b/data/datasets/lilac/mmlu_professional_law/question/lang_detection/signal_manifest.json deleted file mode 100644 index 973db6d92666ad6b4e67edd70b041618e1ed5be7..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/question/lang_detection/signal_manifest.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "lang_detection(question)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "question": { - "fields": { - "lang_detection": { - "dtype": "string", - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - } - } - } - } - } - }, - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - }, - "enriched_path": [ - "question" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/mmlu_professional_law/question/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/mmlu_professional_law/question/near_dup/data-00000-of-00001.parquet deleted file mode 100644 index 8fcde5a91f92f7d942777bca0dacda105d6677b3..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/question/near_dup/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c4139f699d1a248cf5378c442ef6f17970913394d5d0c79bd7c6e6801ab548a -size 3697516 diff --git a/data/datasets/lilac/mmlu_professional_law/question/near_dup/signal_manifest.json b/data/datasets/lilac/mmlu_professional_law/question/near_dup/signal_manifest.json deleted file mode 100644 index 3da5c66d3d35050bd9590d6ee73d0107dfc6d788..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/question/near_dup/signal_manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "near_dup(question)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "question": { - "fields": { - "near_dup": { - "fields": { - "cluster_id": { - "dtype": "uint32", - "categorical": true - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - } - } - } - } - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - }, - "enriched_path": [ - "question" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/mmlu_professional_law/question/pii/data-00000-of-00001.parquet b/data/datasets/lilac/mmlu_professional_law/question/pii/data-00000-of-00001.parquet deleted file mode 100644 index 9d23c418b1121440a401cb356ee95ca4a2ef4b28..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/question/pii/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2735c4a2c5d40973652d369140533af74425db6dd753f8a25850d4efeee4928e -size 3369080 diff --git a/data/datasets/lilac/mmlu_professional_law/question/pii/signal_manifest.json b/data/datasets/lilac/mmlu_professional_law/question/pii/signal_manifest.json deleted file mode 100644 index 7f82232c8066c00959e30503f41c1d852627a812..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/question/pii/signal_manifest.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "pii(question)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "question": { - "fields": { - "pii": { - "fields": { - "emails": { - "repeated_field": { - "dtype": "string_span" - } - }, - "ip_addresses": { - "repeated_field": { - "dtype": "string_span" - } - }, - "secrets": { - "repeated_field": { - "dtype": "string_span" - } - } - }, - "signal": { - "signal_name": "pii" - } - } - } - } - } - }, - "signal": { - "signal_name": "pii" - }, - "enriched_path": [ - "question" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/mmlu_professional_law/question/spacy_ner/data-00000-of-00001.parquet b/data/datasets/lilac/mmlu_professional_law/question/spacy_ner/data-00000-of-00001.parquet deleted file mode 100644 index 293305bfe2d3e259d904b881e20c7738b947dac7..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/question/spacy_ner/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e775b663f9a3b7c7ebdd31f9a860254dec31c18aa46c5a61820050d0556cbb0f -size 9105982 diff --git a/data/datasets/lilac/mmlu_professional_law/question/spacy_ner/signal_manifest.json b/data/datasets/lilac/mmlu_professional_law/question/spacy_ner/signal_manifest.json deleted file mode 100644 index 44045a52264b58737ad4ef9def2f7194a481a762..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/question/spacy_ner/signal_manifest.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "spacy_ner(question)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "question": { - "fields": { - "spacy_ner": { - "repeated_field": { - "fields": { - "label": { - "dtype": "string" - } - }, - "dtype": "string_span" - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - } - } - } - } - } - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - }, - "enriched_path": [ - "question" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/mmlu_professional_law/question/text_statistics/data-00000-of-00001.parquet b/data/datasets/lilac/mmlu_professional_law/question/text_statistics/data-00000-of-00001.parquet deleted file mode 100644 index 7a614f56286007faeb3212c67d97bce12ebc3da8..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/question/text_statistics/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:995b3ac42907ea244d9cb04c68a4715af8ddb7d72dcced056bc58dc9a9f05e7e -size 4389031 diff --git a/data/datasets/lilac/mmlu_professional_law/question/text_statistics/signal_manifest.json b/data/datasets/lilac/mmlu_professional_law/question/text_statistics/signal_manifest.json deleted file mode 100644 index e52fe6eef5bd2c1f65350e1b161e182aec82e7f6..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/mmlu_professional_law/question/text_statistics/signal_manifest.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "text_statistics(question)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "question": { - "fields": { - "text_statistics": { - "fields": { - "num_characters": { - "dtype": "int32" - }, - "readability": { - "dtype": "float32" - }, - "log(type_token_ratio)": { - "dtype": "float32" - }, - "frac_non_ascii": { - "dtype": "float32", - "bins": [ - [ - "Low", - null, - 0.15 - ], - [ - "Medium", - 0.15, - 0.3 - ], - [ - "High", - 0.3, - null - ] - ] - } - }, - "signal": { - "signal_name": "text_statistics" - } - } - } - } - } - }, - "signal": { - "signal_name": "text_statistics" - }, - "enriched_path": [ - "question" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/open-asssistant-conversations/config.yml b/data/datasets/lilac/open-asssistant-conversations/config.yml deleted file mode 100644 index d0184048316f7bb319f07185adbb94cad4409764..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/open-asssistant-conversations/config.yml +++ /dev/null @@ -1,31 +0,0 @@ -embeddings: -- embedding: gte-small - path: text -name: open-asssistant-conversations -namespace: local -settings: - preferred_embedding: gte-small - ui: - media_paths: - - text -signals: -- path: text - signal: - signal_name: text_statistics -- path: text - signal: - signal_name: lang_detection -- path: text - signal: - signal_name: near_dup -- path: text - signal: - signal_name: spacy_ner -- path: text - signal: - signal_name: pii -source: - dataset_name: OpenAssistant/oasst1 - source_name: huggingface -tags: -- machine-learning diff --git a/data/datasets/lilac/open-asssistant-conversations/data-00000-of-00001.parquet b/data/datasets/lilac/open-asssistant-conversations/data-00000-of-00001.parquet deleted file mode 100644 index dc3a623c724e5f9fa2df301daf5b58fbe4b91485..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/open-asssistant-conversations/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d53dbedb539cf7fa3f89d739f698fd3ccf1fbbd86dac20bd0b74cf674cc508e8 -size 42071566 diff --git a/data/datasets/lilac/open-asssistant-conversations/manifest.json b/data/datasets/lilac/open-asssistant-conversations/manifest.json deleted file mode 100644 index a7f6735e9a780150f779c9c133c09f08bc8eb19b..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/open-asssistant-conversations/manifest.json +++ /dev/null @@ -1,118 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "data_schema": { - "fields": { - "message_id": { - "dtype": "string" - }, - "parent_id": { - "dtype": "string" - }, - "user_id": { - "dtype": "string" - }, - "created_date": { - "dtype": "string" - }, - "text": { - "dtype": "string" - }, - "role": { - "dtype": "string" - }, - "lang": { - "dtype": "string" - }, - "review_count": { - "dtype": "int32" - }, - "review_result": { - "dtype": "boolean" - }, - "deleted": { - "dtype": "boolean" - }, - "rank": { - "dtype": "int32" - }, - "synthetic": { - "dtype": "boolean" - }, - "model_name": { - "dtype": "string" - }, - "detoxify": { - "fields": { - "toxicity": { - "dtype": "float64" - }, - "severe_toxicity": { - "dtype": "float64" - }, - "obscene": { - "dtype": "float64" - }, - "identity_attack": { - "dtype": "float64" - }, - "insult": { - "dtype": "float64" - }, - "threat": { - "dtype": "float64" - }, - "sexual_explicit": { - "dtype": "float64" - } - } - }, - "message_tree_id": { - "dtype": "string" - }, - "tree_state": { - "dtype": "string" - }, - "emojis": { - "fields": { - "name": { - "repeated_field": { - "dtype": "string" - } - }, - "count": { - "repeated_field": { - "dtype": "int32" - } - } - } - }, - "labels": { - "fields": { - "name": { - "repeated_field": { - "dtype": "string" - } - }, - "value": { - "repeated_field": { - "dtype": "float64" - } - }, - "count": { - "repeated_field": { - "dtype": "int32" - } - } - } - }, - "__hfsplit__": { - "dtype": "string" - }, - "__rowid__": { - "dtype": "string" - } - } - } -} \ No newline at end of file diff --git a/data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.hnswlib.bin b/data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.hnswlib.bin deleted file mode 100644 index 32d02a95613ae3b99cc663122401df64d1c50038..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.hnswlib.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:138c8efe1e911904c3702c582b892acc8c5616062a35773c31872a8969e2badf -size 327991072 diff --git a/data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.lookup.pkl b/data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.lookup.pkl deleted file mode 100644 index 39c725c45331d117370d4b9a8e5994c75ce3728e..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.lookup.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f8bcd3f617d324acd7e13d0d0fabd38065012bea40141579e16681bcdfdcaf46 -size 6171232 diff --git a/data/datasets/lilac/open-asssistant-conversations/text/gte-small/signal_manifest.json b/data/datasets/lilac/open-asssistant-conversations/text/gte-small/signal_manifest.json deleted file mode 100644 index 6c4df1308025f01afd7ad11c2e4eab36963c61b0..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/open-asssistant-conversations/text/gte-small/signal_manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "files": [], - "parquet_id": "gte-small(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "gte-small": { - "repeated_field": { - "fields": { - "embedding": { - "dtype": "embedding" - } - }, - "dtype": "string_span" - }, - "signal": { - "signal_name": "gte-small" - } - } - } - } - } - }, - "signal": { - "signal_name": "gte-small" - }, - "enriched_path": [ - "text" - ], - "vector_store": "hnsw" -} \ No newline at end of file diff --git a/data/datasets/lilac/open-asssistant-conversations/text/gte-small/spans.pkl b/data/datasets/lilac/open-asssistant-conversations/text/gte-small/spans.pkl deleted file mode 100644 index 478681332a8a920e16a7503ab303e4d7d2ebdd3b..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/open-asssistant-conversations/text/gte-small/spans.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b1cda70a8dc3259ff058e5e3ffc24cfbaaafe3fb9ba5c1b836e0757180114e28 -size 5164058 diff --git a/data/datasets/lilac/open-asssistant-conversations/text/lang_detection/data-00000-of-00001.parquet b/data/datasets/lilac/open-asssistant-conversations/text/lang_detection/data-00000-of-00001.parquet deleted file mode 100644 index 1d410264dedd48599cc425eb9fb1f954b0b65b90..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/open-asssistant-conversations/text/lang_detection/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5f8944421e23764080d8fde7460d08aa683ebbafc6fad2bd65654ea701ba50ca -size 2980981 diff --git a/data/datasets/lilac/open-asssistant-conversations/text/lang_detection/signal_manifest.json b/data/datasets/lilac/open-asssistant-conversations/text/lang_detection/signal_manifest.json deleted file mode 100644 index 2163e9fd9535885346193cbb0e9772d01841cd31..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/open-asssistant-conversations/text/lang_detection/signal_manifest.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "lang_detection(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "lang_detection": { - "dtype": "string", - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - } - } - } - } - } - }, - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/open-asssistant-conversations/text/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/open-asssistant-conversations/text/near_dup/data-00000-of-00001.parquet deleted file mode 100644 index e11365f402ba9c9d9aa8485a7d0a64a9420c7e9a..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/open-asssistant-conversations/text/near_dup/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e2949cfca1b91bb99c56364fdb47679301b90d1f51bd1963f04fbbcbe093d15c -size 3486319 diff --git a/data/datasets/lilac/open-asssistant-conversations/text/near_dup/signal_manifest.json b/data/datasets/lilac/open-asssistant-conversations/text/near_dup/signal_manifest.json deleted file mode 100644 index 6febd3e69b9125b0be3b4778f7b65fab8a5d325b..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/open-asssistant-conversations/text/near_dup/signal_manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "near_dup(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "near_dup": { - "fields": { - "cluster_id": { - "dtype": "uint32", - "categorical": true - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - } - } - } - } - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet b/data/datasets/lilac/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet deleted file mode 100644 index a02dd0f102a3eda91bb698aa5e5d69c00c156db4..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5c010e414a4379f8c1637c54864c46c7872a7ed0dc26990c5b755581d2073f8b -size 2953059 diff --git a/data/datasets/lilac/open-asssistant-conversations/text/pii/signal_manifest.json b/data/datasets/lilac/open-asssistant-conversations/text/pii/signal_manifest.json deleted file mode 100644 index ecb2e8b26fb5328004c7cd7b5363b79e88acf8ac..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/open-asssistant-conversations/text/pii/signal_manifest.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "pii(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "pii": { - "fields": { - "emails": { - "repeated_field": { - "dtype": "string_span" - } - }, - "ip_addresses": { - "repeated_field": { - "dtype": "string_span" - } - }, - "secrets": { - "repeated_field": { - "dtype": "string_span" - } - } - }, - "signal": { - "signal_name": "pii" - } - } - } - } - } - }, - "signal": { - "signal_name": "pii" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet b/data/datasets/lilac/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet deleted file mode 100644 index a8f9d4de6526528a6488fb1597e4a130b9430890..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8e011c0efb333a2d028e1be33030bf795fc373f27a3c2ce611099081057df2be -size 5955273 diff --git a/data/datasets/lilac/open-asssistant-conversations/text/spacy_ner/signal_manifest.json b/data/datasets/lilac/open-asssistant-conversations/text/spacy_ner/signal_manifest.json deleted file mode 100644 index 56c8c0883131d3cf35156969931918e4ea2da3a6..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/open-asssistant-conversations/text/spacy_ner/signal_manifest.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "spacy_ner(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "spacy_ner": { - "repeated_field": { - "fields": { - "label": { - "dtype": "string" - } - }, - "dtype": "string_span" - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - } - } - } - } - } - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet b/data/datasets/lilac/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet deleted file mode 100644 index ad531d7932a5f1ca1820567dcb83b9322ebc7265..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb0f5af1af587a3b083dd7859f9cd4a5cf2943e41396c776db9a2a4f59eb4c9d -size 3827015 diff --git a/data/datasets/lilac/open-asssistant-conversations/text/text_statistics/signal_manifest.json b/data/datasets/lilac/open-asssistant-conversations/text/text_statistics/signal_manifest.json deleted file mode 100644 index 642e3171e68afe580d2043a5f4a9f1331fc85208..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/open-asssistant-conversations/text/text_statistics/signal_manifest.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "text_statistics(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "text_statistics": { - "fields": { - "num_characters": { - "dtype": "int32" - }, - "readability": { - "dtype": "float32" - }, - "log(type_token_ratio)": { - "dtype": "float32" - }, - "frac_non_ascii": { - "dtype": "float32", - "bins": [ - [ - "Low", - null, - 0.15 - ], - [ - "Medium", - 0.15, - 0.3 - ], - [ - "High", - 0.3, - null - ] - ] - } - }, - "signal": { - "signal_name": "text_statistics" - } - } - } - } - } - }, - "signal": { - "signal_name": "text_statistics" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/pile-of-law-atticus-contracts/config.yml b/data/datasets/lilac/pile-of-law-atticus-contracts/config.yml deleted file mode 100644 index 53a4e88a33bc2073f5e71c55f760852b4384902a..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-atticus-contracts/config.yml +++ /dev/null @@ -1,14 +0,0 @@ -name: pile-of-law-atticus-contracts -namespace: lilac -settings: - preferred_embedding: gte-small - ui: - media_paths: - - text -source: - config_name: atticus_contracts - dataset_name: pile-of-law/pile-of-law - sample_size: 100000 - source_name: huggingface -tags: -- legal diff --git a/data/datasets/lilac/pile-of-law-atticus-contracts/data-00000-of-00001.parquet b/data/datasets/lilac/pile-of-law-atticus-contracts/data-00000-of-00001.parquet deleted file mode 100644 index 3025f61a94ea6d0b1dcad7f65e77c55ddf166e66..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-atticus-contracts/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:654fd626e1d336406da59fd0efe676c91e58c3752fa0bf668523df6872d2b131 -size 4778681829 diff --git a/data/datasets/lilac/pile-of-law-atticus-contracts/manifest.json b/data/datasets/lilac/pile-of-law-atticus-contracts/manifest.json deleted file mode 100644 index a5568a64b007a55a0195e0a86ec236567a652b2d..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-atticus-contracts/manifest.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "data_schema": { - "fields": { - "text": { - "dtype": "string" - }, - "created_timestamp": { - "dtype": "string" - }, - "downloaded_timestamp": { - "dtype": "string" - }, - "url": { - "dtype": "string" - }, - "__hfsplit__": { - "dtype": "string" - }, - "__rowid__": { - "dtype": "string" - } - } - } -} \ No newline at end of file diff --git a/data/datasets/lilac/pile-of-law-constitutions/config.yml b/data/datasets/lilac/pile-of-law-constitutions/config.yml deleted file mode 100644 index f343cd5141bb0373fbe891d7953f4297dbeda3ce..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-constitutions/config.yml +++ /dev/null @@ -1,23 +0,0 @@ -embeddings: -- embedding: gte-small - path: text -name: pile-of-law-constitutions -namespace: lilac -settings: - preferred_embedding: gte-small - ui: - media_paths: - - text -signals: -- path: text - signal: - signal_name: near_dup -- path: text - signal: - signal_name: pii -source: - config_name: constitutions - dataset_name: pile-of-law/pile-of-law - source_name: huggingface -tags: -- legal diff --git a/data/datasets/lilac/pile-of-law-constitutions/data-00000-of-00001.parquet b/data/datasets/lilac/pile-of-law-constitutions/data-00000-of-00001.parquet deleted file mode 100644 index f7a8cd27f321c9189f5c4d6e35b7c14798e150fc..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-constitutions/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:95d5ed1a03bb7960e4404a2824e56edc196bc868f2653083d76bd15785354d9b -size 11644007 diff --git a/data/datasets/lilac/pile-of-law-constitutions/manifest.json b/data/datasets/lilac/pile-of-law-constitutions/manifest.json deleted file mode 100644 index a5568a64b007a55a0195e0a86ec236567a652b2d..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-constitutions/manifest.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "data_schema": { - "fields": { - "text": { - "dtype": "string" - }, - "created_timestamp": { - "dtype": "string" - }, - "downloaded_timestamp": { - "dtype": "string" - }, - "url": { - "dtype": "string" - }, - "__hfsplit__": { - "dtype": "string" - }, - "__rowid__": { - "dtype": "string" - } - } - } -} \ No newline at end of file diff --git a/data/datasets/lilac/pile-of-law-constitutions/text/gte-small/hnsw.hnswlib.bin b/data/datasets/lilac/pile-of-law-constitutions/text/gte-small/hnsw.hnswlib.bin deleted file mode 100644 index d995684c0389ebe91d11a91955d8c26392b6fce7..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-constitutions/text/gte-small/hnsw.hnswlib.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e0191d5390ec25b1a5df6558e31eebfb6816ba4b0d056782a1734be53f38ec25 -size 165963072 diff --git a/data/datasets/lilac/pile-of-law-constitutions/text/gte-small/hnsw.lookup.pkl b/data/datasets/lilac/pile-of-law-constitutions/text/gte-small/hnsw.lookup.pkl deleted file mode 100644 index 21c516cc4cb643545719f783e6ae36ba7170c53d..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-constitutions/text/gte-small/hnsw.lookup.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a0391b6a70d478587e8e4880da808b3964711040244e72030392f7c1cdac7ad4 -size 1832634 diff --git a/data/datasets/lilac/pile-of-law-constitutions/text/gte-small/signal_manifest.json b/data/datasets/lilac/pile-of-law-constitutions/text/gte-small/signal_manifest.json deleted file mode 100644 index 6c4df1308025f01afd7ad11c2e4eab36963c61b0..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-constitutions/text/gte-small/signal_manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "files": [], - "parquet_id": "gte-small(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "gte-small": { - "repeated_field": { - "fields": { - "embedding": { - "dtype": "embedding" - } - }, - "dtype": "string_span" - }, - "signal": { - "signal_name": "gte-small" - } - } - } - } - } - }, - "signal": { - "signal_name": "gte-small" - }, - "enriched_path": [ - "text" - ], - "vector_store": "hnsw" -} \ No newline at end of file diff --git a/data/datasets/lilac/pile-of-law-constitutions/text/gte-small/spans.pkl b/data/datasets/lilac/pile-of-law-constitutions/text/gte-small/spans.pkl deleted file mode 100644 index 3534fd55b29f9023dda4e8c5a4c658e94c11bb07..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-constitutions/text/gte-small/spans.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8daf511d5157878cacab7058e516638d8c3c354434cb2792f9dd2a83637b21b5 -size 1029851 diff --git a/data/datasets/lilac/pile-of-law-constitutions/text/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/pile-of-law-constitutions/text/near_dup/data-00000-of-00001.parquet deleted file mode 100644 index a41e925249a73726e7e0722530ca93a82588c27f..0000000000000000000000000000000000000000 Binary files a/data/datasets/lilac/pile-of-law-constitutions/text/near_dup/data-00000-of-00001.parquet and /dev/null differ diff --git a/data/datasets/lilac/pile-of-law-constitutions/text/near_dup/signal_manifest.json b/data/datasets/lilac/pile-of-law-constitutions/text/near_dup/signal_manifest.json deleted file mode 100644 index 6febd3e69b9125b0be3b4778f7b65fab8a5d325b..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-constitutions/text/near_dup/signal_manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "near_dup(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "near_dup": { - "fields": { - "cluster_id": { - "dtype": "uint32", - "categorical": true - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - } - } - } - } - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/pile-of-law-constitutions/text/pii/data-00000-of-00001.parquet b/data/datasets/lilac/pile-of-law-constitutions/text/pii/data-00000-of-00001.parquet deleted file mode 100644 index ba3b146b6ec65beeee2af6bb2d0897286307d85f..0000000000000000000000000000000000000000 Binary files a/data/datasets/lilac/pile-of-law-constitutions/text/pii/data-00000-of-00001.parquet and /dev/null differ diff --git a/data/datasets/lilac/pile-of-law-constitutions/text/pii/signal_manifest.json b/data/datasets/lilac/pile-of-law-constitutions/text/pii/signal_manifest.json deleted file mode 100644 index ecb2e8b26fb5328004c7cd7b5363b79e88acf8ac..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-constitutions/text/pii/signal_manifest.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "pii(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "pii": { - "fields": { - "emails": { - "repeated_field": { - "dtype": "string_span" - } - }, - "ip_addresses": { - "repeated_field": { - "dtype": "string_span" - } - }, - "secrets": { - "repeated_field": { - "dtype": "string_span" - } - } - }, - "signal": { - "signal_name": "pii" - } - } - } - } - } - }, - "signal": { - "signal_name": "pii" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/pile-of-law-r-legaladvice/config.yml b/data/datasets/lilac/pile-of-law-r-legaladvice/config.yml deleted file mode 100644 index 549eba60231ac13311f9a29e6c490e2edcc065b8..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-r-legaladvice/config.yml +++ /dev/null @@ -1,32 +0,0 @@ -embeddings: -- embedding: gte-small - path: text -name: pile-of-law-r-legaladvice -namespace: lilac -settings: - preferred_embedding: gte-small - ui: - media_paths: - - text -signals: -- path: text - signal: - signal_name: near_dup -- path: text - signal: - signal_name: text_statistics -- path: text - signal: - signal_name: pii -- path: text - signal: - signal_name: lang_detection -- path: text - signal: - signal_name: spacy_ner -source: - config_name: r_legaladvice - dataset_name: pile-of-law/pile-of-law - source_name: huggingface -tags: -- legal diff --git a/data/datasets/lilac/pile-of-law-r-legaladvice/data-00000-of-00001.parquet b/data/datasets/lilac/pile-of-law-r-legaladvice/data-00000-of-00001.parquet deleted file mode 100644 index fb97fabdf28a292b11466042e1b31bc8cca7f2cc..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-r-legaladvice/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:315910747b04a2b504c0df875caf84812252602ea10a14c2049a45737a80db86 -size 190604294 diff --git a/data/datasets/lilac/pile-of-law-r-legaladvice/manifest.json b/data/datasets/lilac/pile-of-law-r-legaladvice/manifest.json deleted file mode 100644 index a5568a64b007a55a0195e0a86ec236567a652b2d..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-r-legaladvice/manifest.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "data_schema": { - "fields": { - "text": { - "dtype": "string" - }, - "created_timestamp": { - "dtype": "string" - }, - "downloaded_timestamp": { - "dtype": "string" - }, - "url": { - "dtype": "string" - }, - "__hfsplit__": { - "dtype": "string" - }, - "__rowid__": { - "dtype": "string" - } - } - } -} \ No newline at end of file diff --git a/data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/hnsw.hnswlib.bin b/data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/hnsw.hnswlib.bin deleted file mode 100644 index 8ef8df8cacfe7ff6821eca1cc3e17b51ece938b7..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/hnsw.hnswlib.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b613d5dd1584d6a5f8323d1464fa1f9d7a6a3cb8547963a8f3451afc56542a3c -size 1922075524 diff --git a/data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/hnsw.lookup.pkl b/data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/hnsw.lookup.pkl deleted file mode 100644 index 619b37a6c1607c502dd4a36d173dbb19a9d78ad0..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/hnsw.lookup.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f6ea206b1c4dba51a9e18f6db3822615cfe4c0ec435d3ce8c0e78ed4a604d11c -size 24945492 diff --git a/data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/signal_manifest.json b/data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/signal_manifest.json deleted file mode 100644 index 6c4df1308025f01afd7ad11c2e4eab36963c61b0..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/signal_manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "files": [], - "parquet_id": "gte-small(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "gte-small": { - "repeated_field": { - "fields": { - "embedding": { - "dtype": "embedding" - } - }, - "dtype": "string_span" - }, - "signal": { - "signal_name": "gte-small" - } - } - } - } - } - }, - "signal": { - "signal_name": "gte-small" - }, - "enriched_path": [ - "text" - ], - "vector_store": "hnsw" -} \ No newline at end of file diff --git a/data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/spans.pkl b/data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/spans.pkl deleted file mode 100644 index 192b9a5531330c94e16a04417907e4e42606b28c..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/spans.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c233892a5f8fb35cdff7539bc657e15b91f91be9410c63e5a6dfc0aa627831ae -size 15099068 diff --git a/data/datasets/lilac/pile-of-law-r-legaladvice/text/lang_detection/data-00000-of-00001.parquet b/data/datasets/lilac/pile-of-law-r-legaladvice/text/lang_detection/data-00000-of-00001.parquet deleted file mode 100644 index 385f81d5b3b5ad7f7d98c6001d9698a2e6639470..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-r-legaladvice/text/lang_detection/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f09e13588467643bc515248dc8b75003ecab9787b8d30b01959dd4f85d0a5eb0 -size 4827225 diff --git a/data/datasets/lilac/pile-of-law-r-legaladvice/text/lang_detection/signal_manifest.json b/data/datasets/lilac/pile-of-law-r-legaladvice/text/lang_detection/signal_manifest.json deleted file mode 100644 index 2163e9fd9535885346193cbb0e9772d01841cd31..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-r-legaladvice/text/lang_detection/signal_manifest.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "lang_detection(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "lang_detection": { - "dtype": "string", - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - } - } - } - } - } - }, - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/pile-of-law-r-legaladvice/text/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/pile-of-law-r-legaladvice/text/near_dup/data-00000-of-00001.parquet deleted file mode 100644 index 161f7545c3c6c2b955ffe2d1775fbd2a8dbf5dd2..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-r-legaladvice/text/near_dup/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b3d7c4f4df146865336e8e0addba0ff1bfe0218bd6ef7d9b6cd2b1ef21a20405 -size 5743841 diff --git a/data/datasets/lilac/pile-of-law-r-legaladvice/text/near_dup/signal_manifest.json b/data/datasets/lilac/pile-of-law-r-legaladvice/text/near_dup/signal_manifest.json deleted file mode 100644 index 6febd3e69b9125b0be3b4778f7b65fab8a5d325b..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-r-legaladvice/text/near_dup/signal_manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "near_dup(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "near_dup": { - "fields": { - "cluster_id": { - "dtype": "uint32", - "categorical": true - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - } - } - } - } - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/pile-of-law-r-legaladvice/text/pii/data-00000-of-00001.parquet b/data/datasets/lilac/pile-of-law-r-legaladvice/text/pii/data-00000-of-00001.parquet deleted file mode 100644 index 1baf009a7f79be183ca0036d9b458d01b1f8951c..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-r-legaladvice/text/pii/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:38ca9bdf0f4a9dd4b0d1bc83f6f6b1446aa5b61c0566e1ffa165dac1565e3099 -size 4834322 diff --git a/data/datasets/lilac/pile-of-law-r-legaladvice/text/pii/signal_manifest.json b/data/datasets/lilac/pile-of-law-r-legaladvice/text/pii/signal_manifest.json deleted file mode 100644 index ecb2e8b26fb5328004c7cd7b5363b79e88acf8ac..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-r-legaladvice/text/pii/signal_manifest.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "pii(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "pii": { - "fields": { - "emails": { - "repeated_field": { - "dtype": "string_span" - } - }, - "ip_addresses": { - "repeated_field": { - "dtype": "string_span" - } - }, - "secrets": { - "repeated_field": { - "dtype": "string_span" - } - } - }, - "signal": { - "signal_name": "pii" - } - } - } - } - } - }, - "signal": { - "signal_name": "pii" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/pile-of-law-r-legaladvice/text/spacy_ner/data-00000-of-00001.parquet b/data/datasets/lilac/pile-of-law-r-legaladvice/text/spacy_ner/data-00000-of-00001.parquet deleted file mode 100644 index 58b7cc581bad05b776b99faf6a95e767aeac2477..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-r-legaladvice/text/spacy_ner/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:10bb83d20b521ce0035c40b8ab56a0b29f92f728241a71a093ca8c83d5654e8d -size 12134480 diff --git a/data/datasets/lilac/pile-of-law-r-legaladvice/text/spacy_ner/signal_manifest.json b/data/datasets/lilac/pile-of-law-r-legaladvice/text/spacy_ner/signal_manifest.json deleted file mode 100644 index 56c8c0883131d3cf35156969931918e4ea2da3a6..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-r-legaladvice/text/spacy_ner/signal_manifest.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "spacy_ner(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "spacy_ner": { - "repeated_field": { - "fields": { - "label": { - "dtype": "string" - } - }, - "dtype": "string_span" - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - } - } - } - } - } - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/pile-of-law-r-legaladvice/text/text_statistics/data-00000-of-00001.parquet b/data/datasets/lilac/pile-of-law-r-legaladvice/text/text_statistics/data-00000-of-00001.parquet deleted file mode 100644 index 5d0a0c370a9a08c6eef6d06044c84c58d9c6e916..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-r-legaladvice/text/text_statistics/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bcfb0d11c300e2d97d10c57e63b777cb347936e24253758c716bed24e0b7a2e8 -size 6649363 diff --git a/data/datasets/lilac/pile-of-law-r-legaladvice/text/text_statistics/signal_manifest.json b/data/datasets/lilac/pile-of-law-r-legaladvice/text/text_statistics/signal_manifest.json deleted file mode 100644 index 642e3171e68afe580d2043a5f4a9f1331fc85208..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/pile-of-law-r-legaladvice/text/text_statistics/signal_manifest.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "text_statistics(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "text_statistics": { - "fields": { - "num_characters": { - "dtype": "int32" - }, - "readability": { - "dtype": "float32" - }, - "log(type_token_ratio)": { - "dtype": "float32" - }, - "frac_non_ascii": { - "dtype": "float32", - "bins": [ - [ - "Low", - null, - 0.15 - ], - [ - "Medium", - 0.15, - 0.3 - ], - [ - "High", - 0.3, - null - ] - ] - } - }, - "signal": { - "signal_name": "text_statistics" - } - } - } - } - } - }, - "signal": { - "signal_name": "text_statistics" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/config.yml b/data/datasets/lilac/piqa/config.yml deleted file mode 100644 index d52225af41706646b4a8c53d4bd78bd70dc0669e..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/config.yml +++ /dev/null @@ -1,67 +0,0 @@ -embeddings: -- embedding: gte-small - path: goal -- embedding: gte-small - path: sol2 -- embedding: gte-small - path: sol1 -name: piqa -namespace: local -settings: - preferred_embedding: gte-small - ui: - media_paths: - - sol1 - - sol2 - - goal -signals: -- path: sol1 - signal: - signal_name: near_dup -- path: sol1 - signal: - signal_name: text_statistics -- path: sol1 - signal: - signal_name: pii -- path: sol1 - signal: - signal_name: lang_detection -- path: sol1 - signal: - signal_name: spacy_ner -- path: sol2 - signal: - signal_name: near_dup -- path: sol2 - signal: - signal_name: pii -- path: sol2 - signal: - signal_name: spacy_ner -- path: sol2 - signal: - signal_name: lang_detection -- path: sol2 - signal: - signal_name: text_statistics -- path: goal - signal: - signal_name: near_dup -- path: goal - signal: - signal_name: text_statistics -- path: goal - signal: - signal_name: spacy_ner -- path: goal - signal: - signal_name: lang_detection -- path: goal - signal: - signal_name: pii -source: - dataset_name: piqa - source_name: huggingface -tags: -- machine-learning diff --git a/data/datasets/lilac/piqa/data-00000-of-00001.parquet b/data/datasets/lilac/piqa/data-00000-of-00001.parquet deleted file mode 100644 index bf38c39924376e7eee1170e510f0ee20521fc431..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1530d99cdc0988b355d3dcbcd6f7e29439e4048535b598eaf08f2c372d5a76a4 -size 4040510 diff --git a/data/datasets/lilac/piqa/goal/gte-small/hnsw.hnswlib.bin b/data/datasets/lilac/piqa/goal/gte-small/hnsw.hnswlib.bin deleted file mode 100644 index b5a0eb8ccf59130822d17474d84435ed47ee0b8a..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/goal/gte-small/hnsw.hnswlib.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e46018f32560457c6bb77c8cb6270be81c48730a37e640d70de62b5e75d9b0ad -size 35437836 diff --git a/data/datasets/lilac/piqa/goal/gte-small/hnsw.lookup.pkl b/data/datasets/lilac/piqa/goal/gte-small/hnsw.lookup.pkl deleted file mode 100644 index f70d0ecbbf3e2ce4f690321fca45533e20d67b14..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/goal/gte-small/hnsw.lookup.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:151e20cb34bc6ff100936ca9da15e7443e8ecc23cf68aa10977f90a5f5e3802b -size 1010253 diff --git a/data/datasets/lilac/piqa/goal/gte-small/signal_manifest.json b/data/datasets/lilac/piqa/goal/gte-small/signal_manifest.json deleted file mode 100644 index e38b0540f2a3f21235cb535004316e0c442228f3..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/goal/gte-small/signal_manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "files": [], - "parquet_id": "gte-small(goal)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "goal": { - "fields": { - "gte-small": { - "repeated_field": { - "fields": { - "embedding": { - "dtype": "embedding" - } - }, - "dtype": "string_span" - }, - "signal": { - "signal_name": "gte-small" - } - } - } - } - } - }, - "signal": { - "signal_name": "gte-small" - }, - "enriched_path": [ - "goal" - ], - "vector_store": "hnsw" -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/goal/gte-small/spans.pkl b/data/datasets/lilac/piqa/goal/gte-small/spans.pkl deleted file mode 100644 index 4097d71f5a2333d7e96161221ea78a1b0678dce1..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/goal/gte-small/spans.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:af17a91bcfa9ef113f360a6a6a625741b81ba887a577393c23aff0621535b52b -size 1009873 diff --git a/data/datasets/lilac/piqa/goal/lang_detection/data-00000-of-00001.parquet b/data/datasets/lilac/piqa/goal/lang_detection/data-00000-of-00001.parquet deleted file mode 100644 index 42fc59c8b41da29d5c0d947ef5e41c12f584256b..0000000000000000000000000000000000000000 Binary files a/data/datasets/lilac/piqa/goal/lang_detection/data-00000-of-00001.parquet and /dev/null differ diff --git a/data/datasets/lilac/piqa/goal/lang_detection/signal_manifest.json b/data/datasets/lilac/piqa/goal/lang_detection/signal_manifest.json deleted file mode 100644 index 20fd48a65b9a66e2b42b59952a06c9e8c4bd23a5..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/goal/lang_detection/signal_manifest.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "lang_detection(goal)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "goal": { - "fields": { - "lang_detection": { - "dtype": "string", - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - } - } - } - } - } - }, - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - }, - "enriched_path": [ - "goal" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/goal/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/piqa/goal/near_dup/data-00000-of-00001.parquet deleted file mode 100644 index 3f8aa38ac019e6a12a9a42ea72bd90fb46975fe8..0000000000000000000000000000000000000000 Binary files a/data/datasets/lilac/piqa/goal/near_dup/data-00000-of-00001.parquet and /dev/null differ diff --git a/data/datasets/lilac/piqa/goal/near_dup/signal_manifest.json b/data/datasets/lilac/piqa/goal/near_dup/signal_manifest.json deleted file mode 100644 index 229d8c6643b134214a13ccfa54b27648bb23aba4..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/goal/near_dup/signal_manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "near_dup(goal)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "goal": { - "fields": { - "near_dup": { - "fields": { - "cluster_id": { - "dtype": "uint32", - "categorical": true - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - } - } - } - } - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - }, - "enriched_path": [ - "goal" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/goal/pii/data-00000-of-00001.parquet b/data/datasets/lilac/piqa/goal/pii/data-00000-of-00001.parquet deleted file mode 100644 index b8419aa671392c0397e45e5b2623ae15bfeaacff..0000000000000000000000000000000000000000 Binary files a/data/datasets/lilac/piqa/goal/pii/data-00000-of-00001.parquet and /dev/null differ diff --git a/data/datasets/lilac/piqa/goal/pii/signal_manifest.json b/data/datasets/lilac/piqa/goal/pii/signal_manifest.json deleted file mode 100644 index cf829e25c9fb255627cf66f09e37bb2249cb0644..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/goal/pii/signal_manifest.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "pii(goal)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "goal": { - "fields": { - "pii": { - "fields": { - "emails": { - "repeated_field": { - "dtype": "string_span" - } - }, - "ip_addresses": { - "repeated_field": { - "dtype": "string_span" - } - }, - "secrets": { - "repeated_field": { - "dtype": "string_span" - } - } - }, - "signal": { - "signal_name": "pii" - } - } - } - } - } - }, - "signal": { - "signal_name": "pii" - }, - "enriched_path": [ - "goal" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/goal/spacy_ner/data-00000-of-00001.parquet b/data/datasets/lilac/piqa/goal/spacy_ner/data-00000-of-00001.parquet deleted file mode 100644 index 858338b2d642926da9ce11c9868cf7c2884bb8f5..0000000000000000000000000000000000000000 Binary files a/data/datasets/lilac/piqa/goal/spacy_ner/data-00000-of-00001.parquet and /dev/null differ diff --git a/data/datasets/lilac/piqa/goal/spacy_ner/signal_manifest.json b/data/datasets/lilac/piqa/goal/spacy_ner/signal_manifest.json deleted file mode 100644 index b15ae045aa9216a6cc56fe1a82a2cb13b5661db5..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/goal/spacy_ner/signal_manifest.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "spacy_ner(goal)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "goal": { - "fields": { - "spacy_ner": { - "repeated_field": { - "fields": { - "label": { - "dtype": "string" - } - }, - "dtype": "string_span" - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - } - } - } - } - } - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - }, - "enriched_path": [ - "goal" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/goal/text_statistics/data-00000-of-00001.parquet b/data/datasets/lilac/piqa/goal/text_statistics/data-00000-of-00001.parquet deleted file mode 100644 index 2db9b856db171622031762b075eadb1cd201b2f2..0000000000000000000000000000000000000000 Binary files a/data/datasets/lilac/piqa/goal/text_statistics/data-00000-of-00001.parquet and /dev/null differ diff --git a/data/datasets/lilac/piqa/goal/text_statistics/signal_manifest.json b/data/datasets/lilac/piqa/goal/text_statistics/signal_manifest.json deleted file mode 100644 index 7411f33b9cf775aa384d9cf6868101647212599d..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/goal/text_statistics/signal_manifest.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "text_statistics(goal)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "goal": { - "fields": { - "text_statistics": { - "fields": { - "num_characters": { - "dtype": "int32" - }, - "readability": { - "dtype": "float32" - }, - "log(type_token_ratio)": { - "dtype": "float32" - }, - "frac_non_ascii": { - "dtype": "float32", - "bins": [ - [ - "Low", - null, - 0.15 - ], - [ - "Medium", - 0.15, - 0.3 - ], - [ - "High", - 0.3, - null - ] - ] - } - }, - "signal": { - "signal_name": "text_statistics" - } - } - } - } - } - }, - "signal": { - "signal_name": "text_statistics" - }, - "enriched_path": [ - "goal" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/manifest.json b/data/datasets/lilac/piqa/manifest.json deleted file mode 100644 index cc381f565472c6d4c2588d10e9d9d19e5f864d26..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/manifest.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "data_schema": { - "fields": { - "goal": { - "dtype": "string" - }, - "sol1": { - "dtype": "string" - }, - "sol2": { - "dtype": "string" - }, - "label": { - "dtype": "string" - }, - "__hfsplit__": { - "dtype": "string" - }, - "__rowid__": { - "dtype": "string" - } - } - } -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/sol1/gte-small/hnsw.hnswlib.bin b/data/datasets/lilac/piqa/sol1/gte-small/hnsw.hnswlib.bin deleted file mode 100644 index 7a236ec29b4499a1b55a9d01fc1e977dbf925744..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/sol1/gte-small/hnsw.hnswlib.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:91976ea520234c445b947e745fabddaae0cf455d2a8e8d18ccb945905c642ed2 -size 36174988 diff --git a/data/datasets/lilac/piqa/sol1/gte-small/hnsw.lookup.pkl b/data/datasets/lilac/piqa/sol1/gte-small/hnsw.lookup.pkl deleted file mode 100644 index 4016b3e5cbc2eb7d0a2d2c9bc814d4989bf4b5c2..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/sol1/gte-small/hnsw.lookup.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1333c87108731a3851176d7751a918e8a170de5a6d4e0e0a5b79f8663b37494e -size 1018110 diff --git a/data/datasets/lilac/piqa/sol1/gte-small/signal_manifest.json b/data/datasets/lilac/piqa/sol1/gte-small/signal_manifest.json deleted file mode 100644 index e2fc00649d968a40bcfd6e8b602475aff580b5e8..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/sol1/gte-small/signal_manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "files": [], - "parquet_id": "gte-small(sol1)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "sol1": { - "fields": { - "gte-small": { - "repeated_field": { - "fields": { - "embedding": { - "dtype": "embedding" - } - }, - "dtype": "string_span" - }, - "signal": { - "signal_name": "gte-small" - } - } - } - } - } - }, - "signal": { - "signal_name": "gte-small" - }, - "enriched_path": [ - "sol1" - ], - "vector_store": "hnsw" -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/sol1/gte-small/spans.pkl b/data/datasets/lilac/piqa/sol1/gte-small/spans.pkl deleted file mode 100644 index 37a2fbf2f14cc8ce3bdfffb6d99a9b3e68838d6c..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/sol1/gte-small/spans.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b043969052fae73d232c222d10362ce093ca26b9224b6ea14c87cf65affd04df -size 1015221 diff --git a/data/datasets/lilac/piqa/sol1/lang_detection/data-00000-of-00001.parquet b/data/datasets/lilac/piqa/sol1/lang_detection/data-00000-of-00001.parquet deleted file mode 100644 index ea0da12d380c06c0a8d6a3db11b768e1583bbb64..0000000000000000000000000000000000000000 Binary files a/data/datasets/lilac/piqa/sol1/lang_detection/data-00000-of-00001.parquet and /dev/null differ diff --git a/data/datasets/lilac/piqa/sol1/lang_detection/signal_manifest.json b/data/datasets/lilac/piqa/sol1/lang_detection/signal_manifest.json deleted file mode 100644 index c86df1c4c09f3bff65d0223549be1df392b9a8eb..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/sol1/lang_detection/signal_manifest.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "lang_detection(sol1)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "sol1": { - "fields": { - "lang_detection": { - "dtype": "string", - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - } - } - } - } - } - }, - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - }, - "enriched_path": [ - "sol1" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/sol1/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/piqa/sol1/near_dup/data-00000-of-00001.parquet deleted file mode 100644 index 290a52d50eb6aea1f5cf4abd78dca49dbb411f22..0000000000000000000000000000000000000000 Binary files a/data/datasets/lilac/piqa/sol1/near_dup/data-00000-of-00001.parquet and /dev/null differ diff --git a/data/datasets/lilac/piqa/sol1/near_dup/signal_manifest.json b/data/datasets/lilac/piqa/sol1/near_dup/signal_manifest.json deleted file mode 100644 index ce5c8c5432a9475af951a5ab8f895132940bad1a..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/sol1/near_dup/signal_manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "near_dup(sol1)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "sol1": { - "fields": { - "near_dup": { - "fields": { - "cluster_id": { - "dtype": "uint32", - "categorical": true - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - } - } - } - } - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - }, - "enriched_path": [ - "sol1" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/sol1/pii/data-00000-of-00001.parquet b/data/datasets/lilac/piqa/sol1/pii/data-00000-of-00001.parquet deleted file mode 100644 index 193364c2e59d6967ba5d2e00791cf6440b26be50..0000000000000000000000000000000000000000 Binary files a/data/datasets/lilac/piqa/sol1/pii/data-00000-of-00001.parquet and /dev/null differ diff --git a/data/datasets/lilac/piqa/sol1/pii/signal_manifest.json b/data/datasets/lilac/piqa/sol1/pii/signal_manifest.json deleted file mode 100644 index 5a53e8f453f2df2e34367079c7d52e5499dc07d0..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/sol1/pii/signal_manifest.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "pii(sol1)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "sol1": { - "fields": { - "pii": { - "fields": { - "emails": { - "repeated_field": { - "dtype": "string_span" - } - }, - "ip_addresses": { - "repeated_field": { - "dtype": "string_span" - } - }, - "secrets": { - "repeated_field": { - "dtype": "string_span" - } - } - }, - "signal": { - "signal_name": "pii" - } - } - } - } - } - }, - "signal": { - "signal_name": "pii" - }, - "enriched_path": [ - "sol1" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/sol1/spacy_ner/data-00000-of-00001.parquet b/data/datasets/lilac/piqa/sol1/spacy_ner/data-00000-of-00001.parquet deleted file mode 100644 index 22497734966267a9a15a47aa3a79b7f1bb544e57..0000000000000000000000000000000000000000 Binary files a/data/datasets/lilac/piqa/sol1/spacy_ner/data-00000-of-00001.parquet and /dev/null differ diff --git a/data/datasets/lilac/piqa/sol1/spacy_ner/signal_manifest.json b/data/datasets/lilac/piqa/sol1/spacy_ner/signal_manifest.json deleted file mode 100644 index 2c35db410ee04e0913985a784c241340a0899043..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/sol1/spacy_ner/signal_manifest.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "spacy_ner(sol1)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "sol1": { - "fields": { - "spacy_ner": { - "repeated_field": { - "fields": { - "label": { - "dtype": "string" - } - }, - "dtype": "string_span" - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - } - } - } - } - } - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - }, - "enriched_path": [ - "sol1" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/sol1/text_statistics/data-00000-of-00001.parquet b/data/datasets/lilac/piqa/sol1/text_statistics/data-00000-of-00001.parquet deleted file mode 100644 index 9706fd142b5c777bc00e63a5f3229b39c299f41b..0000000000000000000000000000000000000000 Binary files a/data/datasets/lilac/piqa/sol1/text_statistics/data-00000-of-00001.parquet and /dev/null differ diff --git a/data/datasets/lilac/piqa/sol1/text_statistics/signal_manifest.json b/data/datasets/lilac/piqa/sol1/text_statistics/signal_manifest.json deleted file mode 100644 index d8177cbf4d1e85fb3064088ed71b0db1977cebcf..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/sol1/text_statistics/signal_manifest.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "text_statistics(sol1)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "sol1": { - "fields": { - "text_statistics": { - "fields": { - "num_characters": { - "dtype": "int32" - }, - "readability": { - "dtype": "float32" - }, - "log(type_token_ratio)": { - "dtype": "float32" - }, - "frac_non_ascii": { - "dtype": "float32", - "bins": [ - [ - "Low", - null, - 0.15 - ], - [ - "Medium", - 0.15, - 0.3 - ], - [ - "High", - 0.3, - null - ] - ] - } - }, - "signal": { - "signal_name": "text_statistics" - } - } - } - } - } - }, - "signal": { - "signal_name": "text_statistics" - }, - "enriched_path": [ - "sol1" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/sol2/gte-small/hnsw.hnswlib.bin b/data/datasets/lilac/piqa/sol2/gte-small/hnsw.hnswlib.bin deleted file mode 100644 index ca467cc92758852bc183f82d923710481eed6a83..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/sol2/gte-small/hnsw.hnswlib.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b61683bccd432e615553a1e508b236f265663e74ec67c60edb4f368e169d7b65 -size 36180028 diff --git a/data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl b/data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl deleted file mode 100644 index 77935440f5af04e0a91346a9f61a07725c0e4972..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d814ae715f918b41488589fabed6fba617820b1619c484e437a6d575a89692c3 -size 1018164 diff --git a/data/datasets/lilac/piqa/sol2/gte-small/signal_manifest.json b/data/datasets/lilac/piqa/sol2/gte-small/signal_manifest.json deleted file mode 100644 index ea3076918836fe1c827391a2a3a39f2282bbbfe5..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/sol2/gte-small/signal_manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "files": [], - "parquet_id": "gte-small(sol2)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "sol2": { - "fields": { - "gte-small": { - "repeated_field": { - "fields": { - "embedding": { - "dtype": "embedding" - } - }, - "dtype": "string_span" - }, - "signal": { - "signal_name": "gte-small" - } - } - } - } - } - }, - "signal": { - "signal_name": "gte-small" - }, - "enriched_path": [ - "sol2" - ], - "vector_store": "hnsw" -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/sol2/gte-small/spans.pkl b/data/datasets/lilac/piqa/sol2/gte-small/spans.pkl deleted file mode 100644 index e53d139449545d1a8ebfd44cfa159c7ad97d06e0..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/sol2/gte-small/spans.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1bae45d635be51218f7dbf110a5fd4935d7afb802dd79425d18164161e2f1645 -size 1015232 diff --git a/data/datasets/lilac/piqa/sol2/lang_detection/data-00000-of-00001.parquet b/data/datasets/lilac/piqa/sol2/lang_detection/data-00000-of-00001.parquet deleted file mode 100644 index ed2b6372ff6de411e3b56e1cd5fbe6ea81faacbd..0000000000000000000000000000000000000000 Binary files a/data/datasets/lilac/piqa/sol2/lang_detection/data-00000-of-00001.parquet and /dev/null differ diff --git a/data/datasets/lilac/piqa/sol2/lang_detection/signal_manifest.json b/data/datasets/lilac/piqa/sol2/lang_detection/signal_manifest.json deleted file mode 100644 index 9cfe8b083d788adcdabc203d60f3bd1018f7f4d7..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/sol2/lang_detection/signal_manifest.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "lang_detection(sol2)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "sol2": { - "fields": { - "lang_detection": { - "dtype": "string", - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - } - } - } - } - } - }, - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - }, - "enriched_path": [ - "sol2" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/sol2/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/piqa/sol2/near_dup/data-00000-of-00001.parquet deleted file mode 100644 index 5608922e59505fb08b564fc9fd2920ff580cbeea..0000000000000000000000000000000000000000 Binary files a/data/datasets/lilac/piqa/sol2/near_dup/data-00000-of-00001.parquet and /dev/null differ diff --git a/data/datasets/lilac/piqa/sol2/near_dup/signal_manifest.json b/data/datasets/lilac/piqa/sol2/near_dup/signal_manifest.json deleted file mode 100644 index 3683dfd250bec75cbdf7c6e72c395e322d4c747f..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/sol2/near_dup/signal_manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "near_dup(sol2)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "sol2": { - "fields": { - "near_dup": { - "fields": { - "cluster_id": { - "dtype": "uint32", - "categorical": true - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - } - } - } - } - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - }, - "enriched_path": [ - "sol2" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/sol2/pii/data-00000-of-00001.parquet b/data/datasets/lilac/piqa/sol2/pii/data-00000-of-00001.parquet deleted file mode 100644 index f9a93454f68babb682a06575688013ad9ac41660..0000000000000000000000000000000000000000 Binary files a/data/datasets/lilac/piqa/sol2/pii/data-00000-of-00001.parquet and /dev/null differ diff --git a/data/datasets/lilac/piqa/sol2/pii/signal_manifest.json b/data/datasets/lilac/piqa/sol2/pii/signal_manifest.json deleted file mode 100644 index 850d74ea1694d636715e818ac90fd3261354272e..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/sol2/pii/signal_manifest.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "pii(sol2)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "sol2": { - "fields": { - "pii": { - "fields": { - "emails": { - "repeated_field": { - "dtype": "string_span" - } - }, - "ip_addresses": { - "repeated_field": { - "dtype": "string_span" - } - }, - "secrets": { - "repeated_field": { - "dtype": "string_span" - } - } - }, - "signal": { - "signal_name": "pii" - } - } - } - } - } - }, - "signal": { - "signal_name": "pii" - }, - "enriched_path": [ - "sol2" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/sol2/spacy_ner/data-00000-of-00001.parquet b/data/datasets/lilac/piqa/sol2/spacy_ner/data-00000-of-00001.parquet deleted file mode 100644 index 6df977295c7fcbb0d60d4db186775a7eef8960cc..0000000000000000000000000000000000000000 Binary files a/data/datasets/lilac/piqa/sol2/spacy_ner/data-00000-of-00001.parquet and /dev/null differ diff --git a/data/datasets/lilac/piqa/sol2/spacy_ner/signal_manifest.json b/data/datasets/lilac/piqa/sol2/spacy_ner/signal_manifest.json deleted file mode 100644 index 86a39a10f197a125bd652bf5e91465d213fa4b6f..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/sol2/spacy_ner/signal_manifest.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "spacy_ner(sol2)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "sol2": { - "fields": { - "spacy_ner": { - "repeated_field": { - "fields": { - "label": { - "dtype": "string" - } - }, - "dtype": "string_span" - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - } - } - } - } - } - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - }, - "enriched_path": [ - "sol2" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/piqa/sol2/text_statistics/data-00000-of-00001.parquet b/data/datasets/lilac/piqa/sol2/text_statistics/data-00000-of-00001.parquet deleted file mode 100644 index 5a3e37e4c595d75bfb1e294270c15fa5b9caf8eb..0000000000000000000000000000000000000000 Binary files a/data/datasets/lilac/piqa/sol2/text_statistics/data-00000-of-00001.parquet and /dev/null differ diff --git a/data/datasets/lilac/piqa/sol2/text_statistics/signal_manifest.json b/data/datasets/lilac/piqa/sol2/text_statistics/signal_manifest.json deleted file mode 100644 index 1bc865bf833b4da708fd65d3f366c80188498b0e..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/piqa/sol2/text_statistics/signal_manifest.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "text_statistics(sol2)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "sol2": { - "fields": { - "text_statistics": { - "fields": { - "num_characters": { - "dtype": "int32" - }, - "readability": { - "dtype": "float32" - }, - "log(type_token_ratio)": { - "dtype": "float32" - }, - "frac_non_ascii": { - "dtype": "float32", - "bins": [ - [ - "Low", - null, - 0.15 - ], - [ - "Medium", - 0.15, - 0.3 - ], - [ - "High", - 0.3, - null - ] - ] - } - }, - "signal": { - "signal_name": "text_statistics" - } - } - } - } - } - }, - "signal": { - "signal_name": "text_statistics" - }, - "enriched_path": [ - "sol2" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/squad_v2/answers/text/lang_detection/data-00000-of-00001.parquet b/data/datasets/lilac/squad_v2/answers/text/lang_detection/data-00000-of-00001.parquet deleted file mode 100644 index 18370c77f27621e4898136157d8b4601c7d496fc..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/answers/text/lang_detection/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c03103ba132a7209461f86bd1045431d06db431930344e4bdf97236347cc2164 -size 4738120 diff --git a/data/datasets/lilac/squad_v2/answers/text/lang_detection/signal_manifest.json b/data/datasets/lilac/squad_v2/answers/text/lang_detection/signal_manifest.json deleted file mode 100644 index d9140bd6b0a3567aa8f4f54dca03ff53dde54c62..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/answers/text/lang_detection/signal_manifest.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "lang_detection(answers.text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "answers": { - "fields": { - "text": { - "repeated_field": { - "fields": { - "lang_detection": { - "dtype": "string", - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - } - } - } - } - } - } - } - } - }, - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - }, - "enriched_path": [ - "answers", - "text", - "*" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/squad_v2/answers/text/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/squad_v2/answers/text/near_dup/data-00000-of-00001.parquet deleted file mode 100644 index 1ac3c2fcf06bafc847c9d371316e3aebcc95eaf4..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/answers/text/near_dup/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cf4ae7259d126104da2aea0e1fad0c7cd83033f7774f0d44a2436f7c891fde34 -size 5224344 diff --git a/data/datasets/lilac/squad_v2/answers/text/near_dup/signal_manifest.json b/data/datasets/lilac/squad_v2/answers/text/near_dup/signal_manifest.json deleted file mode 100644 index 8832994bd6398ffe3af09c791f63af9b655eb9e5..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/answers/text/near_dup/signal_manifest.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "near_dup(answers.text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "answers": { - "fields": { - "text": { - "repeated_field": { - "fields": { - "near_dup": { - "fields": { - "cluster_id": { - "dtype": "uint32", - "categorical": true - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - } - } - } - } - } - } - } - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - }, - "enriched_path": [ - "answers", - "text", - "*" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/squad_v2/answers/text/pii/data-00000-of-00001.parquet b/data/datasets/lilac/squad_v2/answers/text/pii/data-00000-of-00001.parquet deleted file mode 100644 index fc02cf578d3adb754281fb2428acff8aa5aeb050..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/answers/text/pii/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:adf2c35877ae9957a987049c40a9a1b2edbe4b2d93b1da86bfeb739fae240040 -size 4841393 diff --git a/data/datasets/lilac/squad_v2/answers/text/pii/signal_manifest.json b/data/datasets/lilac/squad_v2/answers/text/pii/signal_manifest.json deleted file mode 100644 index 8495a5ffb37e173e3df5160432a4e580399db30a..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/answers/text/pii/signal_manifest.json +++ /dev/null @@ -1,53 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "pii(answers.text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "answers": { - "fields": { - "text": { - "repeated_field": { - "fields": { - "pii": { - "fields": { - "emails": { - "repeated_field": { - "dtype": "string_span" - } - }, - "ip_addresses": { - "repeated_field": { - "dtype": "string_span" - } - }, - "secrets": { - "repeated_field": { - "dtype": "string_span" - } - } - }, - "signal": { - "signal_name": "pii" - } - } - } - } - } - } - } - } - }, - "signal": { - "signal_name": "pii" - }, - "enriched_path": [ - "answers", - "text", - "*" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/squad_v2/answers/text/spacy_ner/data-00000-of-00001.parquet b/data/datasets/lilac/squad_v2/answers/text/spacy_ner/data-00000-of-00001.parquet deleted file mode 100644 index 64aed13c7603c98bf6d1b2a893ffa6a898a2352a..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/answers/text/spacy_ner/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:86cfdb80cf22a545cd01d557ae3396e942fa9679990c43457d110a6dac6a2d78 -size 5041580 diff --git a/data/datasets/lilac/squad_v2/answers/text/spacy_ner/signal_manifest.json b/data/datasets/lilac/squad_v2/answers/text/spacy_ner/signal_manifest.json deleted file mode 100644 index 4c5428a0a2901c9d5b507c22e5656a9b197af781..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/answers/text/spacy_ner/signal_manifest.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "spacy_ner(answers.text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "answers": { - "fields": { - "text": { - "repeated_field": { - "fields": { - "spacy_ner": { - "repeated_field": { - "fields": { - "label": { - "dtype": "string" - } - }, - "dtype": "string_span" - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - } - } - } - } - } - } - } - } - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - }, - "enriched_path": [ - "answers", - "text", - "*" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/squad_v2/answers/text/text_statistics/data-00000-of-00001.parquet b/data/datasets/lilac/squad_v2/answers/text/text_statistics/data-00000-of-00001.parquet deleted file mode 100644 index 59aceaf596dd4f7485424af0a6567f6b5223f685..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/answers/text/text_statistics/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:16b5ddbc51455341a26121c5427bd0f32639515dad34d77561402df81d8ab903 -size 5100206 diff --git a/data/datasets/lilac/squad_v2/answers/text/text_statistics/signal_manifest.json b/data/datasets/lilac/squad_v2/answers/text/text_statistics/signal_manifest.json deleted file mode 100644 index 05168462e15f9fa6579b65b4ac047f19ac184184..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/answers/text/text_statistics/signal_manifest.json +++ /dev/null @@ -1,67 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "text_statistics(answers.text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "answers": { - "fields": { - "text": { - "repeated_field": { - "fields": { - "text_statistics": { - "fields": { - "num_characters": { - "dtype": "int32" - }, - "readability": { - "dtype": "float32" - }, - "log(type_token_ratio)": { - "dtype": "float32" - }, - "frac_non_ascii": { - "dtype": "float32", - "bins": [ - [ - "Low", - null, - 0.15 - ], - [ - "Medium", - 0.15, - 0.3 - ], - [ - "High", - 0.3, - null - ] - ] - } - }, - "signal": { - "signal_name": "text_statistics" - } - } - } - } - } - } - } - } - }, - "signal": { - "signal_name": "text_statistics" - }, - "enriched_path": [ - "answers", - "text", - "*" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/squad_v2/config.yml b/data/datasets/lilac/squad_v2/config.yml deleted file mode 100644 index c08953432e5cace62d0ec2f9db9e4f6ebb68a08f..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/config.yml +++ /dev/null @@ -1,80 +0,0 @@ -embeddings: -- embedding: gte-small - path: context -name: squad_v2 -namespace: local -settings: - preferred_embedding: gte-small - ui: - media_paths: - - context - - question - - - answers - - text - - '*' -signals: -- path: context - signal: - signal_name: text_statistics -- path: context - signal: - signal_name: pii -- path: context - signal: - signal_name: near_dup -- path: question - signal: - signal_name: spacy_ner -- path: question - signal: - signal_name: pii -- path: - - answers - - text - - '*' - signal: - signal_name: pii -- path: - - answers - - text - - '*' - signal: - signal_name: spacy_ner -- path: - - answers - - text - - '*' - signal: - signal_name: near_dup -- path: context - signal: - signal_name: lang_detection -- path: - - answers - - text - - '*' - signal: - signal_name: lang_detection -- path: question - signal: - signal_name: near_dup -- path: question - signal: - signal_name: lang_detection -- path: - - answers - - text - - '*' - signal: - signal_name: text_statistics -- path: question - signal: - signal_name: text_statistics -- path: context - signal: - signal_name: spacy_ner -source: - dataset_name: squad_v2 - source_name: huggingface -tags: -- machine-learning diff --git a/data/datasets/lilac/squad_v2/context/gte-small/hnsw.hnswlib.bin b/data/datasets/lilac/squad_v2/context/gte-small/hnsw.hnswlib.bin deleted file mode 100644 index f4feaca6fddbbe1b2a3f74deee7d786b1bdc5651..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/context/gte-small/hnsw.hnswlib.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e24a3d0200e46bb221dacc8066ccc85033ff0378721338cfd60612f130e034d1 -size 601394376 diff --git a/data/datasets/lilac/squad_v2/context/gte-small/hnsw.lookup.pkl b/data/datasets/lilac/squad_v2/context/gte-small/hnsw.lookup.pkl deleted file mode 100644 index c41e5ecd61d84ce1484cdaaa03a6698179d4c1a2..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/context/gte-small/hnsw.lookup.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bc41116b96f4e1fa547697ce62afe0fe7aba054a8d694b308e1e0270474801da -size 10694495 diff --git a/data/datasets/lilac/squad_v2/context/gte-small/signal_manifest.json b/data/datasets/lilac/squad_v2/context/gte-small/signal_manifest.json deleted file mode 100644 index 772cd149b1871246d577334ba66c97eba2970f67..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/context/gte-small/signal_manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "files": [], - "parquet_id": "gte-small(context)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "context": { - "fields": { - "gte-small": { - "repeated_field": { - "fields": { - "embedding": { - "dtype": "embedding" - } - }, - "dtype": "string_span" - }, - "signal": { - "signal_name": "gte-small" - } - } - } - } - } - }, - "signal": { - "signal_name": "gte-small" - }, - "enriched_path": [ - "context" - ], - "vector_store": "hnsw" -} \ No newline at end of file diff --git a/data/datasets/lilac/squad_v2/context/gte-small/spans.pkl b/data/datasets/lilac/squad_v2/context/gte-small/spans.pkl deleted file mode 100644 index c35befc2653003f14dca0488d4d983f4aa2fac4f..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/context/gte-small/spans.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d69a524ee48c0c218eeb901ae265ae74b12511fee17fe31ae1627c0122e25f04 -size 8815907 diff --git a/data/datasets/lilac/squad_v2/context/lang_detection/data-00000-of-00001.parquet b/data/datasets/lilac/squad_v2/context/lang_detection/data-00000-of-00001.parquet deleted file mode 100644 index ff2336747f6c1a09d311195ce9e9ab086f2e0f65..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/context/lang_detection/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cd10704958387059935d9e22f0415677a6daf3105105af8314314ce3c3114274 -size 4682949 diff --git a/data/datasets/lilac/squad_v2/context/lang_detection/signal_manifest.json b/data/datasets/lilac/squad_v2/context/lang_detection/signal_manifest.json deleted file mode 100644 index a9c658f13553ec6197f0b5342c878d96956f32b3..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/context/lang_detection/signal_manifest.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "lang_detection(context)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "context": { - "fields": { - "lang_detection": { - "dtype": "string", - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - } - } - } - } - } - }, - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - }, - "enriched_path": [ - "context" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/squad_v2/context/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/squad_v2/context/near_dup/data-00000-of-00001.parquet deleted file mode 100644 index 9ec286b9d904b07a9b8c17d99d0c1a2f0f4f040c..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/context/near_dup/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e0d3a359a05aa7c073900e4973569f808afa26c7bf0328c31e553efcc14bea90 -size 4962702 diff --git a/data/datasets/lilac/squad_v2/context/near_dup/signal_manifest.json b/data/datasets/lilac/squad_v2/context/near_dup/signal_manifest.json deleted file mode 100644 index 74e3d7239cf63d33a84ce5aba5a88cecd5dc6873..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/context/near_dup/signal_manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "near_dup(context)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "context": { - "fields": { - "near_dup": { - "fields": { - "cluster_id": { - "dtype": "uint32", - "categorical": true - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - } - } - } - } - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - }, - "enriched_path": [ - "context" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/squad_v2/context/pii/data-00000-of-00001.parquet b/data/datasets/lilac/squad_v2/context/pii/data-00000-of-00001.parquet deleted file mode 100644 index 901d98cbc959ebfa6f6fc072505bfde60008f61c..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/context/pii/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0265330a9d7ff27498f4f0e9ddce89a027203d11941d6bc8f8d4334872346d9c -size 4685328 diff --git a/data/datasets/lilac/squad_v2/context/pii/signal_manifest.json b/data/datasets/lilac/squad_v2/context/pii/signal_manifest.json deleted file mode 100644 index 5e2c68f4626d74914590e8adf2e0b726291bc900..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/context/pii/signal_manifest.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "pii(context)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "context": { - "fields": { - "pii": { - "fields": { - "emails": { - "repeated_field": { - "dtype": "string_span" - } - }, - "ip_addresses": { - "repeated_field": { - "dtype": "string_span" - } - }, - "secrets": { - "repeated_field": { - "dtype": "string_span" - } - } - }, - "signal": { - "signal_name": "pii" - } - } - } - } - } - }, - "signal": { - "signal_name": "pii" - }, - "enriched_path": [ - "context" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/squad_v2/context/spacy_ner/data-00000-of-00001.parquet b/data/datasets/lilac/squad_v2/context/spacy_ner/data-00000-of-00001.parquet deleted file mode 100644 index d7d65f3ff575119240df48862f562231e8d76dea..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/context/spacy_ner/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ffe9e61a5449146e9b7cf725b80f46277b359a1aca1d04fe15c5bae4e9f286f1 -size 9241062 diff --git a/data/datasets/lilac/squad_v2/context/spacy_ner/signal_manifest.json b/data/datasets/lilac/squad_v2/context/spacy_ner/signal_manifest.json deleted file mode 100644 index e7510ea1f84e6a100fb8dd99636c48dd525db878..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/context/spacy_ner/signal_manifest.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "spacy_ner(context)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "context": { - "fields": { - "spacy_ner": { - "repeated_field": { - "fields": { - "label": { - "dtype": "string" - } - }, - "dtype": "string_span" - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - } - } - } - } - } - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - }, - "enriched_path": [ - "context" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/squad_v2/context/text_statistics/data-00000-of-00001.parquet b/data/datasets/lilac/squad_v2/context/text_statistics/data-00000-of-00001.parquet deleted file mode 100644 index fe26d3844a63b0a797c292e402ed812119e15c38..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/context/text_statistics/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f1915970f9199dc82019cbb6089c85df3ddfd189848e0f34f549e34b617cd0f8 -size 5165481 diff --git a/data/datasets/lilac/squad_v2/context/text_statistics/signal_manifest.json b/data/datasets/lilac/squad_v2/context/text_statistics/signal_manifest.json deleted file mode 100644 index e2b1f42d5fa12885616ea8f299e070aea59da166..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/context/text_statistics/signal_manifest.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "text_statistics(context)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "context": { - "fields": { - "text_statistics": { - "fields": { - "num_characters": { - "dtype": "int32" - }, - "readability": { - "dtype": "float32" - }, - "log(type_token_ratio)": { - "dtype": "float32" - }, - "frac_non_ascii": { - "dtype": "float32", - "bins": [ - [ - "Low", - null, - 0.15 - ], - [ - "Medium", - 0.15, - 0.3 - ], - [ - "High", - 0.3, - null - ] - ] - } - }, - "signal": { - "signal_name": "text_statistics" - } - } - } - } - } - }, - "signal": { - "signal_name": "text_statistics" - }, - "enriched_path": [ - "context" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/squad_v2/data-00000-of-00001.parquet b/data/datasets/lilac/squad_v2/data-00000-of-00001.parquet deleted file mode 100644 index ccd293f00e4d1767ee97582ad38417d6e0470549..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a9f54db85b8bacd3ea30ecf70410441e300c783e621767c1d9746d6474852ceb -size 27086838 diff --git a/data/datasets/lilac/squad_v2/manifest.json b/data/datasets/lilac/squad_v2/manifest.json deleted file mode 100644 index 95e901df7d68ab16d58b36a5df33832aa858af99..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/manifest.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "data_schema": { - "fields": { - "id": { - "dtype": "string" - }, - "title": { - "dtype": "string" - }, - "context": { - "dtype": "string" - }, - "question": { - "dtype": "string" - }, - "answers": { - "fields": { - "text": { - "repeated_field": { - "dtype": "string" - } - }, - "answer_start": { - "repeated_field": { - "dtype": "int32" - } - } - } - }, - "__hfsplit__": { - "dtype": "string" - }, - "__rowid__": { - "dtype": "string" - } - } - } -} \ No newline at end of file diff --git a/data/datasets/lilac/squad_v2/question/lang_detection/data-00000-of-00001.parquet b/data/datasets/lilac/squad_v2/question/lang_detection/data-00000-of-00001.parquet deleted file mode 100644 index 3b5fc2a8b43e007eabb9a61b4a92cd7c182b8c87..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/question/lang_detection/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:06d32aa3096865a6236fd620a16499876c919b245e26fe9a2809b3c02eebc13d -size 4694280 diff --git a/data/datasets/lilac/squad_v2/question/lang_detection/signal_manifest.json b/data/datasets/lilac/squad_v2/question/lang_detection/signal_manifest.json deleted file mode 100644 index 973db6d92666ad6b4e67edd70b041618e1ed5be7..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/question/lang_detection/signal_manifest.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "lang_detection(question)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "question": { - "fields": { - "lang_detection": { - "dtype": "string", - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - } - } - } - } - } - }, - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - }, - "enriched_path": [ - "question" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/squad_v2/question/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/squad_v2/question/near_dup/data-00000-of-00001.parquet deleted file mode 100644 index 97ba0ef1589b7603fdcf116f03b2d5ddfc17ad0d..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/question/near_dup/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9b7e1d521750d16c37c70c378306ef22916e2d9715a565f1127d9e3626c966d4 -size 5571030 diff --git a/data/datasets/lilac/squad_v2/question/near_dup/signal_manifest.json b/data/datasets/lilac/squad_v2/question/near_dup/signal_manifest.json deleted file mode 100644 index 3da5c66d3d35050bd9590d6ee73d0107dfc6d788..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/question/near_dup/signal_manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "near_dup(question)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "question": { - "fields": { - "near_dup": { - "fields": { - "cluster_id": { - "dtype": "uint32", - "categorical": true - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - } - } - } - } - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - }, - "enriched_path": [ - "question" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/squad_v2/question/pii/data-00000-of-00001.parquet b/data/datasets/lilac/squad_v2/question/pii/data-00000-of-00001.parquet deleted file mode 100644 index c42879fa94caac7faeae4af1437d0b7c249c54ee..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/question/pii/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a72f4f32331de183cfe67be224ba473ec83ba8f855dafab97371580684718e4f -size 4685523 diff --git a/data/datasets/lilac/squad_v2/question/pii/signal_manifest.json b/data/datasets/lilac/squad_v2/question/pii/signal_manifest.json deleted file mode 100644 index 7f82232c8066c00959e30503f41c1d852627a812..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/question/pii/signal_manifest.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "pii(question)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "question": { - "fields": { - "pii": { - "fields": { - "emails": { - "repeated_field": { - "dtype": "string_span" - } - }, - "ip_addresses": { - "repeated_field": { - "dtype": "string_span" - } - }, - "secrets": { - "repeated_field": { - "dtype": "string_span" - } - } - }, - "signal": { - "signal_name": "pii" - } - } - } - } - } - }, - "signal": { - "signal_name": "pii" - }, - "enriched_path": [ - "question" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/squad_v2/question/spacy_ner/data-00000-of-00001.parquet b/data/datasets/lilac/squad_v2/question/spacy_ner/data-00000-of-00001.parquet deleted file mode 100644 index 630e548861b372eded8e9e3cdfa5fc754fa00a2e..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/question/spacy_ner/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ca09ac95100482b232d2b3c7e9a08490ecb056d10b9e91a23170df6669482a3b -size 5286963 diff --git a/data/datasets/lilac/squad_v2/question/spacy_ner/signal_manifest.json b/data/datasets/lilac/squad_v2/question/spacy_ner/signal_manifest.json deleted file mode 100644 index 44045a52264b58737ad4ef9def2f7194a481a762..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/question/spacy_ner/signal_manifest.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "spacy_ner(question)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "question": { - "fields": { - "spacy_ner": { - "repeated_field": { - "fields": { - "label": { - "dtype": "string" - } - }, - "dtype": "string_span" - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - } - } - } - } - } - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - }, - "enriched_path": [ - "question" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/squad_v2/question/text_statistics/data-00000-of-00001.parquet b/data/datasets/lilac/squad_v2/question/text_statistics/data-00000-of-00001.parquet deleted file mode 100644 index 44faabb88c9e0ccdf1e7b0d0a7ba994ec0d4c910..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/question/text_statistics/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:36304efa2147c8737c1c4da192837aa855504ecfe9eb6f14d267c501bcaaa246 -size 5104750 diff --git a/data/datasets/lilac/squad_v2/question/text_statistics/signal_manifest.json b/data/datasets/lilac/squad_v2/question/text_statistics/signal_manifest.json deleted file mode 100644 index e52fe6eef5bd2c1f65350e1b161e182aec82e7f6..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/squad_v2/question/text_statistics/signal_manifest.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "text_statistics(question)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "question": { - "fields": { - "text_statistics": { - "fields": { - "num_characters": { - "dtype": "int32" - }, - "readability": { - "dtype": "float32" - }, - "log(type_token_ratio)": { - "dtype": "float32" - }, - "frac_non_ascii": { - "dtype": "float32", - "bins": [ - [ - "Low", - null, - 0.15 - ], - [ - "Medium", - 0.15, - 0.3 - ], - [ - "High", - 0.3, - null - ] - ] - } - }, - "signal": { - "signal_name": "text_statistics" - } - } - } - } - } - }, - "signal": { - "signal_name": "text_statistics" - }, - "enriched_path": [ - "question" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/wikitext-2-raw-v1/config.yml b/data/datasets/lilac/wikitext-2-raw-v1/config.yml deleted file mode 100644 index 4a736bcc67562349cb0e143a08e4f231a96d13f3..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/wikitext-2-raw-v1/config.yml +++ /dev/null @@ -1,32 +0,0 @@ -embeddings: -- embedding: gte-small - path: text -name: wikitext-2-raw-v1 -namespace: local -settings: - preferred_embedding: gte-small - ui: - media_paths: - - text -signals: -- path: text - signal: - signal_name: pii -- path: text - signal: - signal_name: near_dup -- path: text - signal: - signal_name: lang_detection -- path: text - signal: - signal_name: text_statistics -- path: text - signal: - signal_name: spacy_ner -source: - config_name: wikitext-2-raw-v1 - dataset_name: wikitext - source_name: huggingface -tags: -- machine-learning diff --git a/data/datasets/lilac/wikitext-2-raw-v1/data-00000-of-00001.parquet b/data/datasets/lilac/wikitext-2-raw-v1/data-00000-of-00001.parquet deleted file mode 100644 index be71c28d7173f5876194c5e663b53d5f77bbfb73..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/wikitext-2-raw-v1/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2d2722d7f4c6b75fdd39f4477d67ed046ab1da08f1fa6d757109d8df77a1a4c2 -size 9171984 diff --git a/data/datasets/lilac/wikitext-2-raw-v1/manifest.json b/data/datasets/lilac/wikitext-2-raw-v1/manifest.json deleted file mode 100644 index 6b0b5700fe6080dfedfee42ab57dc5c8f38051bb..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/wikitext-2-raw-v1/manifest.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "data_schema": { - "fields": { - "text": { - "dtype": "string" - }, - "__hfsplit__": { - "dtype": "string" - }, - "__rowid__": { - "dtype": "string" - } - } - } -} \ No newline at end of file diff --git a/data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/hnsw.hnswlib.bin b/data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/hnsw.hnswlib.bin deleted file mode 100644 index c09f28fa576401bc02c3fa645050d661e5bb043c..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/hnsw.hnswlib.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c9c5174ef8444ec37b0cc39acb01225d77ef8e131d6af4501ac85e904d80a9b2 -size 91233592 diff --git a/data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/hnsw.lookup.pkl b/data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/hnsw.lookup.pkl deleted file mode 100644 index 16765de4ed616eaa17be110e0839b87c06c4bb67..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/hnsw.lookup.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c6de9264ae1f50b8dd78b4fec2601cd88625cfe253eaab24542890a870ecd900 -size 1848900 diff --git a/data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/signal_manifest.json b/data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/signal_manifest.json deleted file mode 100644 index 6c4df1308025f01afd7ad11c2e4eab36963c61b0..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/signal_manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "files": [], - "parquet_id": "gte-small(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "gte-small": { - "repeated_field": { - "fields": { - "embedding": { - "dtype": "embedding" - } - }, - "dtype": "string_span" - }, - "signal": { - "signal_name": "gte-small" - } - } - } - } - } - }, - "signal": { - "signal_name": "gte-small" - }, - "enriched_path": [ - "text" - ], - "vector_store": "hnsw" -} \ No newline at end of file diff --git a/data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/spans.pkl b/data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/spans.pkl deleted file mode 100644 index 96bb1f5443116fd861dba7852016928328c0a046..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/spans.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:064d18ca7f0f3b021ebfa93323bcbe9f50271da2ea2234393581a8113322683a -size 1629720 diff --git a/data/datasets/lilac/wikitext-2-raw-v1/text/lang_detection/data-00000-of-00001.parquet b/data/datasets/lilac/wikitext-2-raw-v1/text/lang_detection/data-00000-of-00001.parquet deleted file mode 100644 index 52365e79c0276932e5a3c6cf4c49666f81cf3eb2..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/wikitext-2-raw-v1/text/lang_detection/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:68e4d6290a46a7dac484299be15b8536324cb677ced9721934bd150a277ddaf9 -size 1532348 diff --git a/data/datasets/lilac/wikitext-2-raw-v1/text/lang_detection/signal_manifest.json b/data/datasets/lilac/wikitext-2-raw-v1/text/lang_detection/signal_manifest.json deleted file mode 100644 index 2163e9fd9535885346193cbb0e9772d01841cd31..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/wikitext-2-raw-v1/text/lang_detection/signal_manifest.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "lang_detection(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "lang_detection": { - "dtype": "string", - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - } - } - } - } - } - }, - "signal": { - "split_by_paragraph": false, - "signal_name": "lang_detection" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/wikitext-2-raw-v1/text/near_dup/data-00000-of-00001.parquet b/data/datasets/lilac/wikitext-2-raw-v1/text/near_dup/data-00000-of-00001.parquet deleted file mode 100644 index 8a5403fb1b4148c5368424e97429d5ac0e97dcc5..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/wikitext-2-raw-v1/text/near_dup/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3f316a8a3f031470cf9fa99475b81a04e962f2da7704d61de8a44089bbf7e59b -size 1703637 diff --git a/data/datasets/lilac/wikitext-2-raw-v1/text/near_dup/signal_manifest.json b/data/datasets/lilac/wikitext-2-raw-v1/text/near_dup/signal_manifest.json deleted file mode 100644 index 6febd3e69b9125b0be3b4778f7b65fab8a5d325b..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/wikitext-2-raw-v1/text/near_dup/signal_manifest.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "near_dup(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "near_dup": { - "fields": { - "cluster_id": { - "dtype": "uint32", - "categorical": true - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - } - } - } - } - } - }, - "signal": { - "threshold": 0.85, - "signal_name": "near_dup" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/wikitext-2-raw-v1/text/pii/data-00000-of-00001.parquet b/data/datasets/lilac/wikitext-2-raw-v1/text/pii/data-00000-of-00001.parquet deleted file mode 100644 index 8029f40ee89cf359dbcda728226e7ae4c33fc0fb..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/wikitext-2-raw-v1/text/pii/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb257c0e65788f446b23d8f389db299c51a896baf0dcbd4d37114b625868f0ad -size 1517126 diff --git a/data/datasets/lilac/wikitext-2-raw-v1/text/pii/signal_manifest.json b/data/datasets/lilac/wikitext-2-raw-v1/text/pii/signal_manifest.json deleted file mode 100644 index ecb2e8b26fb5328004c7cd7b5363b79e88acf8ac..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/wikitext-2-raw-v1/text/pii/signal_manifest.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "pii(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "pii": { - "fields": { - "emails": { - "repeated_field": { - "dtype": "string_span" - } - }, - "ip_addresses": { - "repeated_field": { - "dtype": "string_span" - } - }, - "secrets": { - "repeated_field": { - "dtype": "string_span" - } - } - }, - "signal": { - "signal_name": "pii" - } - } - } - } - } - }, - "signal": { - "signal_name": "pii" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/wikitext-2-raw-v1/text/spacy_ner/data-00000-of-00001.parquet b/data/datasets/lilac/wikitext-2-raw-v1/text/spacy_ner/data-00000-of-00001.parquet deleted file mode 100644 index 2a0b377b9882adce3bebee038f3414ab45d2d307..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/wikitext-2-raw-v1/text/spacy_ner/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:84a98ab86da1750c73ce707415d10487541b2b3265844bc7d2d0a5a2eefe8d15 -size 2718467 diff --git a/data/datasets/lilac/wikitext-2-raw-v1/text/spacy_ner/signal_manifest.json b/data/datasets/lilac/wikitext-2-raw-v1/text/spacy_ner/signal_manifest.json deleted file mode 100644 index 56c8c0883131d3cf35156969931918e4ea2da3a6..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/wikitext-2-raw-v1/text/spacy_ner/signal_manifest.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "spacy_ner(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "spacy_ner": { - "repeated_field": { - "fields": { - "label": { - "dtype": "string" - } - }, - "dtype": "string_span" - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - } - } - } - } - } - }, - "signal": { - "model": "en_core_web_sm", - "signal_name": "spacy_ner" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/data/datasets/lilac/wikitext-2-raw-v1/text/text_statistics/data-00000-of-00001.parquet b/data/datasets/lilac/wikitext-2-raw-v1/text/text_statistics/data-00000-of-00001.parquet deleted file mode 100644 index 5d0e5dedb36bcaa931a3dd1a30e4da9d9fdd67ed..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/wikitext-2-raw-v1/text/text_statistics/data-00000-of-00001.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:96b6d56208f0c74e7c9bc1d858ebabaae941eaf2c95cdf9207a7bdd1418a59ce -size 1827010 diff --git a/data/datasets/lilac/wikitext-2-raw-v1/text/text_statistics/signal_manifest.json b/data/datasets/lilac/wikitext-2-raw-v1/text/text_statistics/signal_manifest.json deleted file mode 100644 index 642e3171e68afe580d2043a5f4a9f1331fc85208..0000000000000000000000000000000000000000 --- a/data/datasets/lilac/wikitext-2-raw-v1/text/text_statistics/signal_manifest.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "files": [ - "data-00000-of-00001.parquet" - ], - "parquet_id": "text_statistics(text)", - "data_schema": { - "fields": { - "__rowid__": { - "dtype": "string" - }, - "text": { - "fields": { - "text_statistics": { - "fields": { - "num_characters": { - "dtype": "int32" - }, - "readability": { - "dtype": "float32" - }, - "log(type_token_ratio)": { - "dtype": "float32" - }, - "frac_non_ascii": { - "dtype": "float32", - "bins": [ - [ - "Low", - null, - 0.15 - ], - [ - "Medium", - 0.15, - 0.3 - ], - [ - "High", - 0.3, - null - ] - ] - } - }, - "signal": { - "signal_name": "text_statistics" - } - } - } - } - } - }, - "signal": { - "signal_name": "text_statistics" - }, - "enriched_path": [ - "text" - ] -} \ No newline at end of file diff --git a/docker_start.py b/docker_start.py index dcb88f7b2b83fcce709812d2d06c8c4a79d5a311..eb28f5f9412e9dd9d45f2c0730efe6108960b9d0 100644 --- a/docker_start.py +++ b/docker_start.py @@ -14,7 +14,11 @@ from lilac.utils import get_dataset_output_dir, get_lilac_cache_dir, log def delete_old_files() -> None: """Delete old files from the cache.""" # Scan cache - scan = scan_cache_dir() + try: + scan = scan_cache_dir() + except BaseException: + # Cache was not found. + return # Select revisions to delete to_delete = [] @@ -48,12 +52,10 @@ def main() -> None: dataset.dataset_name) persistent_output_dir = get_dataset_output_dir(data_path(), dataset.namespace, dataset.dataset_name) - # Huggingface doesn't let you selectively download files so we just copy the data directory - # out of the cloned space. + # Make persistent_output_dir point to spaces_dataset_output_dir. We use a temp symlink + # to allow overwriting older symlinks. os.symlink(spaces_dataset_output_dir, '/data/tmp-link') os.rename('/data/tmp-link', persistent_output_dir) - # shutil.rmtree(persistent_output_dir, ignore_errors=True) - # shutil.copytree(spaces_dataset_output_dir, persistent_output_dir) # Delete cache files from persistent storage. cache_dir = get_lilac_cache_dir(data_path())