Spaces:
Running
Running
Push
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +0 -87
- Dockerfile +1 -1
- data/datasets/lilac/imdb/config.yml +0 -31
- data/datasets/lilac/imdb/data-00000-of-00001.parquet +0 -3
- data/datasets/lilac/imdb/manifest.json +0 -21
- data/datasets/lilac/imdb/text/gte-small/hnsw.hnswlib.bin +0 -3
- data/datasets/lilac/imdb/text/gte-small/hnsw.lookup.pkl +0 -3
- data/datasets/lilac/imdb/text/gte-small/signal_manifest.json +0 -35
- data/datasets/lilac/imdb/text/gte-small/spans.pkl +0 -3
- data/datasets/lilac/imdb/text/lang_detection/data-00000-of-00001.parquet +0 -3
- data/datasets/lilac/imdb/text/lang_detection/signal_manifest.json +0 -31
- data/datasets/lilac/imdb/text/near_dup/data-00000-of-00001.parquet +0 -3
- data/datasets/lilac/imdb/text/near_dup/signal_manifest.json +0 -36
- data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet +0 -3
- data/datasets/lilac/imdb/text/pii/signal_manifest.json +0 -45
- data/datasets/lilac/imdb/text/spacy_ner/data-00000-of-00001.parquet +0 -3
- data/datasets/lilac/imdb/text/spacy_ner/signal_manifest.json +0 -38
- data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet +0 -3
- data/datasets/lilac/imdb/text/text_statistics/signal_manifest.json +0 -59
- data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.hnswlib.bin +0 -3
- data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.lookup.pkl +0 -3
- data/datasets/lilac/mmlu_professional_law/choices/gte-small/signal_manifest.json +0 -38
- data/datasets/lilac/mmlu_professional_law/choices/gte-small/spans.pkl +0 -3
- data/datasets/lilac/mmlu_professional_law/choices/lang_detection/data-00000-of-00001.parquet +0 -3
- data/datasets/lilac/mmlu_professional_law/choices/lang_detection/signal_manifest.json +0 -34
- data/datasets/lilac/mmlu_professional_law/choices/near_dup/data-00000-of-00001.parquet +0 -3
- data/datasets/lilac/mmlu_professional_law/choices/near_dup/signal_manifest.json +0 -39
- data/datasets/lilac/mmlu_professional_law/choices/pii/data-00000-of-00001.parquet +0 -3
- data/datasets/lilac/mmlu_professional_law/choices/pii/signal_manifest.json +0 -48
- data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/data-00000-of-00001.parquet +0 -3
- data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/signal_manifest.json +0 -41
- data/datasets/lilac/mmlu_professional_law/choices/text_statistics/data-00000-of-00001.parquet +0 -3
- data/datasets/lilac/mmlu_professional_law/choices/text_statistics/signal_manifest.json +0 -62
- data/datasets/lilac/mmlu_professional_law/config.yml +0 -63
- data/datasets/lilac/mmlu_professional_law/data-00000-of-00001.parquet +0 -3
- data/datasets/lilac/mmlu_professional_law/manifest.json +0 -26
- data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.hnswlib.bin +0 -3
- data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.lookup.pkl +0 -3
- data/datasets/lilac/mmlu_professional_law/question/gte-small/signal_manifest.json +0 -35
- data/datasets/lilac/mmlu_professional_law/question/gte-small/spans.pkl +0 -3
- data/datasets/lilac/mmlu_professional_law/question/lang_detection/data-00000-of-00001.parquet +0 -3
- data/datasets/lilac/mmlu_professional_law/question/lang_detection/signal_manifest.json +0 -31
- data/datasets/lilac/mmlu_professional_law/question/near_dup/data-00000-of-00001.parquet +0 -3
- data/datasets/lilac/mmlu_professional_law/question/near_dup/signal_manifest.json +0 -36
- data/datasets/lilac/mmlu_professional_law/question/pii/data-00000-of-00001.parquet +0 -3
- data/datasets/lilac/mmlu_professional_law/question/pii/signal_manifest.json +0 -45
- data/datasets/lilac/mmlu_professional_law/question/spacy_ner/data-00000-of-00001.parquet +0 -3
- data/datasets/lilac/mmlu_professional_law/question/spacy_ner/signal_manifest.json +0 -38
- data/datasets/lilac/mmlu_professional_law/question/text_statistics/data-00000-of-00001.parquet +0 -3
- data/datasets/lilac/mmlu_professional_law/question/text_statistics/signal_manifest.json +0 -59
.gitattributes
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
data/datasets/lilac/pile-of-law-constitutions/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
2 |
-
data/datasets/lilac/pile-of-law-constitutions/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
3 |
-
data/datasets/lilac/pile-of-law-constitutions/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
4 |
-
data/datasets/lilac/pile-of-law-constitutions/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
5 |
-
data/datasets/lilac/piqa/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
6 |
-
data/datasets/lilac/piqa/goal/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
7 |
-
data/datasets/lilac/piqa/goal/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
8 |
-
data/datasets/lilac/piqa/goal/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
9 |
-
data/datasets/lilac/piqa/sol1/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
10 |
-
data/datasets/lilac/piqa/sol1/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
11 |
-
data/datasets/lilac/piqa/sol1/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
12 |
-
data/datasets/lilac/piqa/sol2/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
13 |
-
data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
14 |
-
data/datasets/lilac/piqa/sol2/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
15 |
-
data/datasets/lilac/pile-of-law-atticus-contracts/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
16 |
-
data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
17 |
-
data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
18 |
-
data/datasets/lilac/mmlu_professional_law/choices/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
19 |
-
data/datasets/lilac/mmlu_professional_law/choices/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
20 |
-
data/datasets/lilac/mmlu_professional_law/choices/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
21 |
-
data/datasets/lilac/mmlu_professional_law/choices/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
22 |
-
data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
23 |
-
data/datasets/lilac/mmlu_professional_law/choices/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
24 |
-
data/datasets/lilac/mmlu_professional_law/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
25 |
-
data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
26 |
-
data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
27 |
-
data/datasets/lilac/mmlu_professional_law/question/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
28 |
-
data/datasets/lilac/mmlu_professional_law/question/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
29 |
-
data/datasets/lilac/mmlu_professional_law/question/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
30 |
-
data/datasets/lilac/mmlu_professional_law/question/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
31 |
-
data/datasets/lilac/mmlu_professional_law/question/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
32 |
-
data/datasets/lilac/mmlu_professional_law/question/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
33 |
-
data/datasets/lilac/pile-of-law-r-legaladvice/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
34 |
-
data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
35 |
-
data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
36 |
-
data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
37 |
-
data/datasets/lilac/pile-of-law-r-legaladvice/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
38 |
-
data/datasets/lilac/pile-of-law-r-legaladvice/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
39 |
-
data/datasets/lilac/pile-of-law-r-legaladvice/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
40 |
-
data/datasets/lilac/pile-of-law-r-legaladvice/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
41 |
-
data/datasets/lilac/pile-of-law-r-legaladvice/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
42 |
-
data/datasets/lilac/open-asssistant-conversations/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
43 |
-
data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
44 |
-
data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
45 |
-
data/datasets/lilac/open-asssistant-conversations/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
46 |
-
data/datasets/lilac/open-asssistant-conversations/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
47 |
-
data/datasets/lilac/open-asssistant-conversations/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
48 |
-
data/datasets/lilac/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
49 |
-
data/datasets/lilac/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
50 |
-
data/datasets/lilac/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
51 |
-
data/datasets/lilac/squad_v2/answers/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
52 |
-
data/datasets/lilac/squad_v2/answers/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
53 |
-
data/datasets/lilac/squad_v2/answers/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
54 |
-
data/datasets/lilac/squad_v2/answers/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
55 |
-
data/datasets/lilac/squad_v2/answers/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
56 |
-
data/datasets/lilac/squad_v2/context/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
57 |
-
data/datasets/lilac/squad_v2/context/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
58 |
-
data/datasets/lilac/squad_v2/context/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
59 |
-
data/datasets/lilac/squad_v2/context/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
60 |
-
data/datasets/lilac/squad_v2/context/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
61 |
-
data/datasets/lilac/squad_v2/context/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
62 |
-
data/datasets/lilac/squad_v2/context/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
63 |
-
data/datasets/lilac/squad_v2/context/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
64 |
-
data/datasets/lilac/squad_v2/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
65 |
-
data/datasets/lilac/squad_v2/question/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
66 |
-
data/datasets/lilac/squad_v2/question/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
67 |
-
data/datasets/lilac/squad_v2/question/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
68 |
-
data/datasets/lilac/squad_v2/question/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
69 |
-
data/datasets/lilac/squad_v2/question/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
70 |
-
data/datasets/lilac/imdb/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
71 |
-
data/datasets/lilac/imdb/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
72 |
-
data/datasets/lilac/imdb/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
73 |
-
data/datasets/lilac/imdb/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
74 |
-
data/datasets/lilac/imdb/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
75 |
-
data/datasets/lilac/imdb/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
76 |
-
data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
77 |
-
data/datasets/lilac/imdb/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
78 |
-
data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
79 |
-
data/datasets/lilac/wikitext-2-raw-v1/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
80 |
-
data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
81 |
-
data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
82 |
-
data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
83 |
-
data/datasets/lilac/wikitext-2-raw-v1/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
84 |
-
data/datasets/lilac/wikitext-2-raw-v1/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
85 |
-
data/datasets/lilac/wikitext-2-raw-v1/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
86 |
-
data/datasets/lilac/wikitext-2-raw-v1/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
87 |
-
data/datasets/lilac/wikitext-2-raw-v1/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dockerfile
CHANGED
@@ -21,4 +21,4 @@ COPY /lilac ./lilac/
|
|
21 |
|
22 |
COPY docker_start.sh docker_start.py ./
|
23 |
|
24 |
-
CMD
|
|
|
21 |
|
22 |
COPY docker_start.sh docker_start.py ./
|
23 |
|
24 |
+
CMD ["bash", "docker_start.sh"]
|
data/datasets/lilac/imdb/config.yml
DELETED
@@ -1,31 +0,0 @@
|
|
1 |
-
embeddings:
|
2 |
-
- embedding: gte-small
|
3 |
-
path: text
|
4 |
-
name: imdb
|
5 |
-
namespace: local
|
6 |
-
settings:
|
7 |
-
preferred_embedding: gte-small
|
8 |
-
ui:
|
9 |
-
media_paths:
|
10 |
-
- text
|
11 |
-
signals:
|
12 |
-
- path: text
|
13 |
-
signal:
|
14 |
-
signal_name: near_dup
|
15 |
-
- path: text
|
16 |
-
signal:
|
17 |
-
signal_name: text_statistics
|
18 |
-
- path: text
|
19 |
-
signal:
|
20 |
-
signal_name: lang_detection
|
21 |
-
- path: text
|
22 |
-
signal:
|
23 |
-
signal_name: spacy_ner
|
24 |
-
- path: text
|
25 |
-
signal:
|
26 |
-
signal_name: pii
|
27 |
-
source:
|
28 |
-
dataset_name: imdb
|
29 |
-
source_name: huggingface
|
30 |
-
tags:
|
31 |
-
- machine-learning
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/imdb/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:6fe90be23f86ca1e73b8a77a235344db822601f794a5643dca9d0d07c49ce3d8
|
3 |
-
size 86160450
|
|
|
|
|
|
|
|
data/datasets/lilac/imdb/manifest.json
DELETED
@@ -1,21 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"data_schema": {
|
6 |
-
"fields": {
|
7 |
-
"text": {
|
8 |
-
"dtype": "string"
|
9 |
-
},
|
10 |
-
"label": {
|
11 |
-
"dtype": "string"
|
12 |
-
},
|
13 |
-
"__hfsplit__": {
|
14 |
-
"dtype": "string"
|
15 |
-
},
|
16 |
-
"__rowid__": {
|
17 |
-
"dtype": "string"
|
18 |
-
}
|
19 |
-
}
|
20 |
-
}
|
21 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/imdb/text/gte-small/hnsw.hnswlib.bin
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:7b8fc440b947f068966bc68c83ce4b5502b98e3aab928d58c295dc8a7b7b016c
|
3 |
-
size 691432396
|
|
|
|
|
|
|
|
data/datasets/lilac/imdb/text/gte-small/hnsw.lookup.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:d3ee2426c761b50025e02f2388fa65c51eb9705d6fd65f95d6502650214b8472
|
3 |
-
size 10390867
|
|
|
|
|
|
|
|
data/datasets/lilac/imdb/text/gte-small/signal_manifest.json
DELETED
@@ -1,35 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [],
|
3 |
-
"parquet_id": "gte-small(text)",
|
4 |
-
"data_schema": {
|
5 |
-
"fields": {
|
6 |
-
"__rowid__": {
|
7 |
-
"dtype": "string"
|
8 |
-
},
|
9 |
-
"text": {
|
10 |
-
"fields": {
|
11 |
-
"gte-small": {
|
12 |
-
"repeated_field": {
|
13 |
-
"fields": {
|
14 |
-
"embedding": {
|
15 |
-
"dtype": "embedding"
|
16 |
-
}
|
17 |
-
},
|
18 |
-
"dtype": "string_span"
|
19 |
-
},
|
20 |
-
"signal": {
|
21 |
-
"signal_name": "gte-small"
|
22 |
-
}
|
23 |
-
}
|
24 |
-
}
|
25 |
-
}
|
26 |
-
}
|
27 |
-
},
|
28 |
-
"signal": {
|
29 |
-
"signal_name": "gte-small"
|
30 |
-
},
|
31 |
-
"enriched_path": [
|
32 |
-
"text"
|
33 |
-
],
|
34 |
-
"vector_store": "hnsw"
|
35 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/imdb/text/gte-small/spans.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:16417449057fc4304e098e037cb2b3e9a693570768a68bdca457d452adaee130
|
3 |
-
size 7476546
|
|
|
|
|
|
|
|
data/datasets/lilac/imdb/text/lang_detection/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:adba6df16ec47b68625618b01a0b8d2cc65de572acb20eed561128b037fcdfd7
|
3 |
-
size 3309315
|
|
|
|
|
|
|
|
data/datasets/lilac/imdb/text/lang_detection/signal_manifest.json
DELETED
@@ -1,31 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "lang_detection(text)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"text": {
|
12 |
-
"fields": {
|
13 |
-
"lang_detection": {
|
14 |
-
"dtype": "string",
|
15 |
-
"signal": {
|
16 |
-
"split_by_paragraph": false,
|
17 |
-
"signal_name": "lang_detection"
|
18 |
-
}
|
19 |
-
}
|
20 |
-
}
|
21 |
-
}
|
22 |
-
}
|
23 |
-
},
|
24 |
-
"signal": {
|
25 |
-
"split_by_paragraph": false,
|
26 |
-
"signal_name": "lang_detection"
|
27 |
-
},
|
28 |
-
"enriched_path": [
|
29 |
-
"text"
|
30 |
-
]
|
31 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/imdb/text/near_dup/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:0c75a5f0d12b8b02671e99a2dd313d292610c9264f2302ad1877959430420079
|
3 |
-
size 3915752
|
|
|
|
|
|
|
|
data/datasets/lilac/imdb/text/near_dup/signal_manifest.json
DELETED
@@ -1,36 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "near_dup(text)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"text": {
|
12 |
-
"fields": {
|
13 |
-
"near_dup": {
|
14 |
-
"fields": {
|
15 |
-
"cluster_id": {
|
16 |
-
"dtype": "uint32",
|
17 |
-
"categorical": true
|
18 |
-
}
|
19 |
-
},
|
20 |
-
"signal": {
|
21 |
-
"threshold": 0.85,
|
22 |
-
"signal_name": "near_dup"
|
23 |
-
}
|
24 |
-
}
|
25 |
-
}
|
26 |
-
}
|
27 |
-
}
|
28 |
-
},
|
29 |
-
"signal": {
|
30 |
-
"threshold": 0.85,
|
31 |
-
"signal_name": "near_dup"
|
32 |
-
},
|
33 |
-
"enriched_path": [
|
34 |
-
"text"
|
35 |
-
]
|
36 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:2dfe71845bfd419a2a309b132456341bf317d63e8ccf8bc100835f1a20c81c5b
|
3 |
-
size 3313701
|
|
|
|
|
|
|
|
data/datasets/lilac/imdb/text/pii/signal_manifest.json
DELETED
@@ -1,45 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "pii(text)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"text": {
|
12 |
-
"fields": {
|
13 |
-
"pii": {
|
14 |
-
"fields": {
|
15 |
-
"emails": {
|
16 |
-
"repeated_field": {
|
17 |
-
"dtype": "string_span"
|
18 |
-
}
|
19 |
-
},
|
20 |
-
"ip_addresses": {
|
21 |
-
"repeated_field": {
|
22 |
-
"dtype": "string_span"
|
23 |
-
}
|
24 |
-
},
|
25 |
-
"secrets": {
|
26 |
-
"repeated_field": {
|
27 |
-
"dtype": "string_span"
|
28 |
-
}
|
29 |
-
}
|
30 |
-
},
|
31 |
-
"signal": {
|
32 |
-
"signal_name": "pii"
|
33 |
-
}
|
34 |
-
}
|
35 |
-
}
|
36 |
-
}
|
37 |
-
}
|
38 |
-
},
|
39 |
-
"signal": {
|
40 |
-
"signal_name": "pii"
|
41 |
-
},
|
42 |
-
"enriched_path": [
|
43 |
-
"text"
|
44 |
-
]
|
45 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/imdb/text/spacy_ner/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:be39eb80c981da64a2ec5f7fe42c43ffb390b887a220e278a58cdde04e7824ef
|
3 |
-
size 8479478
|
|
|
|
|
|
|
|
data/datasets/lilac/imdb/text/spacy_ner/signal_manifest.json
DELETED
@@ -1,38 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "spacy_ner(text)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"text": {
|
12 |
-
"fields": {
|
13 |
-
"spacy_ner": {
|
14 |
-
"repeated_field": {
|
15 |
-
"fields": {
|
16 |
-
"label": {
|
17 |
-
"dtype": "string"
|
18 |
-
}
|
19 |
-
},
|
20 |
-
"dtype": "string_span"
|
21 |
-
},
|
22 |
-
"signal": {
|
23 |
-
"model": "en_core_web_sm",
|
24 |
-
"signal_name": "spacy_ner"
|
25 |
-
}
|
26 |
-
}
|
27 |
-
}
|
28 |
-
}
|
29 |
-
}
|
30 |
-
},
|
31 |
-
"signal": {
|
32 |
-
"model": "en_core_web_sm",
|
33 |
-
"signal_name": "spacy_ner"
|
34 |
-
},
|
35 |
-
"enriched_path": [
|
36 |
-
"text"
|
37 |
-
]
|
38 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:4919817d0c520fc5aacc9721f01a0e29fb4d2c89b6698865c9680c81c44ce920
|
3 |
-
size 4403809
|
|
|
|
|
|
|
|
data/datasets/lilac/imdb/text/text_statistics/signal_manifest.json
DELETED
@@ -1,59 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "text_statistics(text)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"text": {
|
12 |
-
"fields": {
|
13 |
-
"text_statistics": {
|
14 |
-
"fields": {
|
15 |
-
"num_characters": {
|
16 |
-
"dtype": "int32"
|
17 |
-
},
|
18 |
-
"readability": {
|
19 |
-
"dtype": "float32"
|
20 |
-
},
|
21 |
-
"log(type_token_ratio)": {
|
22 |
-
"dtype": "float32"
|
23 |
-
},
|
24 |
-
"frac_non_ascii": {
|
25 |
-
"dtype": "float32",
|
26 |
-
"bins": [
|
27 |
-
[
|
28 |
-
"Low",
|
29 |
-
null,
|
30 |
-
0.15
|
31 |
-
],
|
32 |
-
[
|
33 |
-
"Medium",
|
34 |
-
0.15,
|
35 |
-
0.3
|
36 |
-
],
|
37 |
-
[
|
38 |
-
"High",
|
39 |
-
0.3,
|
40 |
-
null
|
41 |
-
]
|
42 |
-
]
|
43 |
-
}
|
44 |
-
},
|
45 |
-
"signal": {
|
46 |
-
"signal_name": "text_statistics"
|
47 |
-
}
|
48 |
-
}
|
49 |
-
}
|
50 |
-
}
|
51 |
-
}
|
52 |
-
},
|
53 |
-
"signal": {
|
54 |
-
"signal_name": "text_statistics"
|
55 |
-
},
|
56 |
-
"enriched_path": [
|
57 |
-
"text"
|
58 |
-
]
|
59 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.hnswlib.bin
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:df9d6e2f5df4b8693544f31ca78a9d1936a4caf47acc2babeb1cb766131b7636
|
3 |
-
size 684360968
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.lookup.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:2081ce5d760026fe341e0553cd9e40747ca902e4e7edb851cb747f350f19bb0d
|
3 |
-
size 11174465
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/choices/gte-small/signal_manifest.json
DELETED
@@ -1,38 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [],
|
3 |
-
"parquet_id": "gte-small(choices)",
|
4 |
-
"data_schema": {
|
5 |
-
"fields": {
|
6 |
-
"__rowid__": {
|
7 |
-
"dtype": "string"
|
8 |
-
},
|
9 |
-
"choices": {
|
10 |
-
"repeated_field": {
|
11 |
-
"fields": {
|
12 |
-
"gte-small": {
|
13 |
-
"repeated_field": {
|
14 |
-
"fields": {
|
15 |
-
"embedding": {
|
16 |
-
"dtype": "embedding"
|
17 |
-
}
|
18 |
-
},
|
19 |
-
"dtype": "string_span"
|
20 |
-
},
|
21 |
-
"signal": {
|
22 |
-
"signal_name": "gte-small"
|
23 |
-
}
|
24 |
-
}
|
25 |
-
}
|
26 |
-
}
|
27 |
-
}
|
28 |
-
}
|
29 |
-
},
|
30 |
-
"signal": {
|
31 |
-
"signal_name": "gte-small"
|
32 |
-
},
|
33 |
-
"enriched_path": [
|
34 |
-
"choices",
|
35 |
-
"*"
|
36 |
-
],
|
37 |
-
"vector_store": "hnsw"
|
38 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/choices/gte-small/spans.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:02fb1662da21f33ea1429a0f9adf1301185da46f642a722717fe7c523314fa57
|
3 |
-
size 11173475
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/choices/lang_detection/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:968d4f87c7b51b995d9e3a96423a06b91984e5ee4a47062cd53fe87cca5cafbe
|
3 |
-
size 3469413
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/choices/lang_detection/signal_manifest.json
DELETED
@@ -1,34 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "lang_detection(choices)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"choices": {
|
12 |
-
"repeated_field": {
|
13 |
-
"fields": {
|
14 |
-
"lang_detection": {
|
15 |
-
"dtype": "string",
|
16 |
-
"signal": {
|
17 |
-
"split_by_paragraph": false,
|
18 |
-
"signal_name": "lang_detection"
|
19 |
-
}
|
20 |
-
}
|
21 |
-
}
|
22 |
-
}
|
23 |
-
}
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"signal": {
|
27 |
-
"split_by_paragraph": false,
|
28 |
-
"signal_name": "lang_detection"
|
29 |
-
},
|
30 |
-
"enriched_path": [
|
31 |
-
"choices",
|
32 |
-
"*"
|
33 |
-
]
|
34 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/choices/near_dup/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:316f3be499fcbb960bc1e83a201838ca0b3047a71d8e1c302fe4e0d833a3bf90
|
3 |
-
size 5544176
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/choices/near_dup/signal_manifest.json
DELETED
@@ -1,39 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "near_dup(choices)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"choices": {
|
12 |
-
"repeated_field": {
|
13 |
-
"fields": {
|
14 |
-
"near_dup": {
|
15 |
-
"fields": {
|
16 |
-
"cluster_id": {
|
17 |
-
"dtype": "uint32",
|
18 |
-
"categorical": true
|
19 |
-
}
|
20 |
-
},
|
21 |
-
"signal": {
|
22 |
-
"threshold": 0.85,
|
23 |
-
"signal_name": "near_dup"
|
24 |
-
}
|
25 |
-
}
|
26 |
-
}
|
27 |
-
}
|
28 |
-
}
|
29 |
-
}
|
30 |
-
},
|
31 |
-
"signal": {
|
32 |
-
"threshold": 0.85,
|
33 |
-
"signal_name": "near_dup"
|
34 |
-
},
|
35 |
-
"enriched_path": [
|
36 |
-
"choices",
|
37 |
-
"*"
|
38 |
-
]
|
39 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/choices/pii/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:7cb41d4e9d0d82bd824abfa733d5be3a599e011098c5d41ebadeb1166a15f722
|
3 |
-
size 3393096
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/choices/pii/signal_manifest.json
DELETED
@@ -1,48 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "pii(choices)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"choices": {
|
12 |
-
"repeated_field": {
|
13 |
-
"fields": {
|
14 |
-
"pii": {
|
15 |
-
"fields": {
|
16 |
-
"emails": {
|
17 |
-
"repeated_field": {
|
18 |
-
"dtype": "string_span"
|
19 |
-
}
|
20 |
-
},
|
21 |
-
"ip_addresses": {
|
22 |
-
"repeated_field": {
|
23 |
-
"dtype": "string_span"
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"secrets": {
|
27 |
-
"repeated_field": {
|
28 |
-
"dtype": "string_span"
|
29 |
-
}
|
30 |
-
}
|
31 |
-
},
|
32 |
-
"signal": {
|
33 |
-
"signal_name": "pii"
|
34 |
-
}
|
35 |
-
}
|
36 |
-
}
|
37 |
-
}
|
38 |
-
}
|
39 |
-
}
|
40 |
-
},
|
41 |
-
"signal": {
|
42 |
-
"signal_name": "pii"
|
43 |
-
},
|
44 |
-
"enriched_path": [
|
45 |
-
"choices",
|
46 |
-
"*"
|
47 |
-
]
|
48 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:4b1255490f17c64f88b8b332c7df30060df612b9de11b17aaf6f70234c363e1e
|
3 |
-
size 4080744
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/choices/spacy_ner/signal_manifest.json
DELETED
@@ -1,41 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "spacy_ner(choices)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"choices": {
|
12 |
-
"repeated_field": {
|
13 |
-
"fields": {
|
14 |
-
"spacy_ner": {
|
15 |
-
"repeated_field": {
|
16 |
-
"fields": {
|
17 |
-
"label": {
|
18 |
-
"dtype": "string"
|
19 |
-
}
|
20 |
-
},
|
21 |
-
"dtype": "string_span"
|
22 |
-
},
|
23 |
-
"signal": {
|
24 |
-
"model": "en_core_web_sm",
|
25 |
-
"signal_name": "spacy_ner"
|
26 |
-
}
|
27 |
-
}
|
28 |
-
}
|
29 |
-
}
|
30 |
-
}
|
31 |
-
}
|
32 |
-
},
|
33 |
-
"signal": {
|
34 |
-
"model": "en_core_web_sm",
|
35 |
-
"signal_name": "spacy_ner"
|
36 |
-
},
|
37 |
-
"enriched_path": [
|
38 |
-
"choices",
|
39 |
-
"*"
|
40 |
-
]
|
41 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/choices/text_statistics/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:bc00a68e0f835e25b214d90e7e48251b39d748f1e836af713440cd0ea2517ead
|
3 |
-
size 4634821
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/choices/text_statistics/signal_manifest.json
DELETED
@@ -1,62 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "text_statistics(choices)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"choices": {
|
12 |
-
"repeated_field": {
|
13 |
-
"fields": {
|
14 |
-
"text_statistics": {
|
15 |
-
"fields": {
|
16 |
-
"num_characters": {
|
17 |
-
"dtype": "int32"
|
18 |
-
},
|
19 |
-
"readability": {
|
20 |
-
"dtype": "float32"
|
21 |
-
},
|
22 |
-
"log(type_token_ratio)": {
|
23 |
-
"dtype": "float32"
|
24 |
-
},
|
25 |
-
"frac_non_ascii": {
|
26 |
-
"dtype": "float32",
|
27 |
-
"bins": [
|
28 |
-
[
|
29 |
-
"Low",
|
30 |
-
null,
|
31 |
-
0.15
|
32 |
-
],
|
33 |
-
[
|
34 |
-
"Medium",
|
35 |
-
0.15,
|
36 |
-
0.3
|
37 |
-
],
|
38 |
-
[
|
39 |
-
"High",
|
40 |
-
0.3,
|
41 |
-
null
|
42 |
-
]
|
43 |
-
]
|
44 |
-
}
|
45 |
-
},
|
46 |
-
"signal": {
|
47 |
-
"signal_name": "text_statistics"
|
48 |
-
}
|
49 |
-
}
|
50 |
-
}
|
51 |
-
}
|
52 |
-
}
|
53 |
-
}
|
54 |
-
},
|
55 |
-
"signal": {
|
56 |
-
"signal_name": "text_statistics"
|
57 |
-
},
|
58 |
-
"enriched_path": [
|
59 |
-
"choices",
|
60 |
-
"*"
|
61 |
-
]
|
62 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/config.yml
DELETED
@@ -1,63 +0,0 @@
|
|
1 |
-
embeddings:
|
2 |
-
- embedding: gte-small
|
3 |
-
path:
|
4 |
-
- choices
|
5 |
-
- '*'
|
6 |
-
- embedding: gte-small
|
7 |
-
path: question
|
8 |
-
name: mmlu_professional_law
|
9 |
-
namespace: local
|
10 |
-
settings:
|
11 |
-
preferred_embedding: gte-small
|
12 |
-
ui:
|
13 |
-
media_paths:
|
14 |
-
- question
|
15 |
-
- - choices
|
16 |
-
- '*'
|
17 |
-
signals:
|
18 |
-
- path: question
|
19 |
-
signal:
|
20 |
-
signal_name: text_statistics
|
21 |
-
- path: question
|
22 |
-
signal:
|
23 |
-
signal_name: pii
|
24 |
-
- path: question
|
25 |
-
signal:
|
26 |
-
signal_name: near_dup
|
27 |
-
- path:
|
28 |
-
- choices
|
29 |
-
- '*'
|
30 |
-
signal:
|
31 |
-
signal_name: text_statistics
|
32 |
-
- path:
|
33 |
-
- choices
|
34 |
-
- '*'
|
35 |
-
signal:
|
36 |
-
signal_name: spacy_ner
|
37 |
-
- path: question
|
38 |
-
signal:
|
39 |
-
signal_name: lang_detection
|
40 |
-
- path:
|
41 |
-
- choices
|
42 |
-
- '*'
|
43 |
-
signal:
|
44 |
-
signal_name: near_dup
|
45 |
-
- path:
|
46 |
-
- choices
|
47 |
-
- '*'
|
48 |
-
signal:
|
49 |
-
signal_name: pii
|
50 |
-
- path:
|
51 |
-
- choices
|
52 |
-
- '*'
|
53 |
-
signal:
|
54 |
-
signal_name: lang_detection
|
55 |
-
- path: question
|
56 |
-
signal:
|
57 |
-
signal_name: spacy_ner
|
58 |
-
source:
|
59 |
-
config_name: professional_law
|
60 |
-
dataset_name: cais/mmlu
|
61 |
-
source_name: huggingface
|
62 |
-
tags:
|
63 |
-
- legal
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:65cd2771cf0bb88dbed9ad66ceaff472115f07c9dfea866c7e3f65b68392e745
|
3 |
-
size 50699938
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/manifest.json
DELETED
@@ -1,26 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"data_schema": {
|
6 |
-
"fields": {
|
7 |
-
"question": {
|
8 |
-
"dtype": "string"
|
9 |
-
},
|
10 |
-
"choices": {
|
11 |
-
"repeated_field": {
|
12 |
-
"dtype": "string"
|
13 |
-
}
|
14 |
-
},
|
15 |
-
"answer": {
|
16 |
-
"dtype": "string"
|
17 |
-
},
|
18 |
-
"__hfsplit__": {
|
19 |
-
"dtype": "string"
|
20 |
-
},
|
21 |
-
"__rowid__": {
|
22 |
-
"dtype": "string"
|
23 |
-
}
|
24 |
-
}
|
25 |
-
}
|
26 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.hnswlib.bin
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:3b02300405fccc3011294e15ee869933dd81578173435defbcb19e3b40a65e93
|
3 |
-
size 771802212
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.lookup.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:f72169740d80ee2b2ea66589d7ebcc58c83381978a4640a27510c416a02bf6c7
|
3 |
-
size 11296648
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/question/gte-small/signal_manifest.json
DELETED
@@ -1,35 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [],
|
3 |
-
"parquet_id": "gte-small(question)",
|
4 |
-
"data_schema": {
|
5 |
-
"fields": {
|
6 |
-
"__rowid__": {
|
7 |
-
"dtype": "string"
|
8 |
-
},
|
9 |
-
"question": {
|
10 |
-
"fields": {
|
11 |
-
"gte-small": {
|
12 |
-
"repeated_field": {
|
13 |
-
"fields": {
|
14 |
-
"embedding": {
|
15 |
-
"dtype": "embedding"
|
16 |
-
}
|
17 |
-
},
|
18 |
-
"dtype": "string_span"
|
19 |
-
},
|
20 |
-
"signal": {
|
21 |
-
"signal_name": "gte-small"
|
22 |
-
}
|
23 |
-
}
|
24 |
-
}
|
25 |
-
}
|
26 |
-
}
|
27 |
-
},
|
28 |
-
"signal": {
|
29 |
-
"signal_name": "gte-small"
|
30 |
-
},
|
31 |
-
"enriched_path": [
|
32 |
-
"question"
|
33 |
-
],
|
34 |
-
"vector_store": "hnsw"
|
35 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/question/gte-small/spans.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:9b51cad455e94b167bc9cf130c262ed1b143a8f386c7074a61983e01cd93d277
|
3 |
-
size 7911602
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/question/lang_detection/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:bf6cf8cdc246ce4406599aec8782d3be02f2585f1fbad74173faf0ffcb453a49
|
3 |
-
size 3361922
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/question/lang_detection/signal_manifest.json
DELETED
@@ -1,31 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "lang_detection(question)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"question": {
|
12 |
-
"fields": {
|
13 |
-
"lang_detection": {
|
14 |
-
"dtype": "string",
|
15 |
-
"signal": {
|
16 |
-
"split_by_paragraph": false,
|
17 |
-
"signal_name": "lang_detection"
|
18 |
-
}
|
19 |
-
}
|
20 |
-
}
|
21 |
-
}
|
22 |
-
}
|
23 |
-
},
|
24 |
-
"signal": {
|
25 |
-
"split_by_paragraph": false,
|
26 |
-
"signal_name": "lang_detection"
|
27 |
-
},
|
28 |
-
"enriched_path": [
|
29 |
-
"question"
|
30 |
-
]
|
31 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/question/near_dup/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:2c4139f699d1a248cf5378c442ef6f17970913394d5d0c79bd7c6e6801ab548a
|
3 |
-
size 3697516
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/question/near_dup/signal_manifest.json
DELETED
@@ -1,36 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "near_dup(question)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"question": {
|
12 |
-
"fields": {
|
13 |
-
"near_dup": {
|
14 |
-
"fields": {
|
15 |
-
"cluster_id": {
|
16 |
-
"dtype": "uint32",
|
17 |
-
"categorical": true
|
18 |
-
}
|
19 |
-
},
|
20 |
-
"signal": {
|
21 |
-
"threshold": 0.85,
|
22 |
-
"signal_name": "near_dup"
|
23 |
-
}
|
24 |
-
}
|
25 |
-
}
|
26 |
-
}
|
27 |
-
}
|
28 |
-
},
|
29 |
-
"signal": {
|
30 |
-
"threshold": 0.85,
|
31 |
-
"signal_name": "near_dup"
|
32 |
-
},
|
33 |
-
"enriched_path": [
|
34 |
-
"question"
|
35 |
-
]
|
36 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/question/pii/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:2735c4a2c5d40973652d369140533af74425db6dd753f8a25850d4efeee4928e
|
3 |
-
size 3369080
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/question/pii/signal_manifest.json
DELETED
@@ -1,45 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "pii(question)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"question": {
|
12 |
-
"fields": {
|
13 |
-
"pii": {
|
14 |
-
"fields": {
|
15 |
-
"emails": {
|
16 |
-
"repeated_field": {
|
17 |
-
"dtype": "string_span"
|
18 |
-
}
|
19 |
-
},
|
20 |
-
"ip_addresses": {
|
21 |
-
"repeated_field": {
|
22 |
-
"dtype": "string_span"
|
23 |
-
}
|
24 |
-
},
|
25 |
-
"secrets": {
|
26 |
-
"repeated_field": {
|
27 |
-
"dtype": "string_span"
|
28 |
-
}
|
29 |
-
}
|
30 |
-
},
|
31 |
-
"signal": {
|
32 |
-
"signal_name": "pii"
|
33 |
-
}
|
34 |
-
}
|
35 |
-
}
|
36 |
-
}
|
37 |
-
}
|
38 |
-
},
|
39 |
-
"signal": {
|
40 |
-
"signal_name": "pii"
|
41 |
-
},
|
42 |
-
"enriched_path": [
|
43 |
-
"question"
|
44 |
-
]
|
45 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/question/spacy_ner/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:e775b663f9a3b7c7ebdd31f9a860254dec31c18aa46c5a61820050d0556cbb0f
|
3 |
-
size 9105982
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/question/spacy_ner/signal_manifest.json
DELETED
@@ -1,38 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "spacy_ner(question)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"question": {
|
12 |
-
"fields": {
|
13 |
-
"spacy_ner": {
|
14 |
-
"repeated_field": {
|
15 |
-
"fields": {
|
16 |
-
"label": {
|
17 |
-
"dtype": "string"
|
18 |
-
}
|
19 |
-
},
|
20 |
-
"dtype": "string_span"
|
21 |
-
},
|
22 |
-
"signal": {
|
23 |
-
"model": "en_core_web_sm",
|
24 |
-
"signal_name": "spacy_ner"
|
25 |
-
}
|
26 |
-
}
|
27 |
-
}
|
28 |
-
}
|
29 |
-
}
|
30 |
-
},
|
31 |
-
"signal": {
|
32 |
-
"model": "en_core_web_sm",
|
33 |
-
"signal_name": "spacy_ner"
|
34 |
-
},
|
35 |
-
"enriched_path": [
|
36 |
-
"question"
|
37 |
-
]
|
38 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/question/text_statistics/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:995b3ac42907ea244d9cb04c68a4715af8ddb7d72dcced056bc58dc9a9f05e7e
|
3 |
-
size 4389031
|
|
|
|
|
|
|
|
data/datasets/lilac/mmlu_professional_law/question/text_statistics/signal_manifest.json
DELETED
@@ -1,59 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "text_statistics(question)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"question": {
|
12 |
-
"fields": {
|
13 |
-
"text_statistics": {
|
14 |
-
"fields": {
|
15 |
-
"num_characters": {
|
16 |
-
"dtype": "int32"
|
17 |
-
},
|
18 |
-
"readability": {
|
19 |
-
"dtype": "float32"
|
20 |
-
},
|
21 |
-
"log(type_token_ratio)": {
|
22 |
-
"dtype": "float32"
|
23 |
-
},
|
24 |
-
"frac_non_ascii": {
|
25 |
-
"dtype": "float32",
|
26 |
-
"bins": [
|
27 |
-
[
|
28 |
-
"Low",
|
29 |
-
null,
|
30 |
-
0.15
|
31 |
-
],
|
32 |
-
[
|
33 |
-
"Medium",
|
34 |
-
0.15,
|
35 |
-
0.3
|
36 |
-
],
|
37 |
-
[
|
38 |
-
"High",
|
39 |
-
0.3,
|
40 |
-
null
|
41 |
-
]
|
42 |
-
]
|
43 |
-
}
|
44 |
-
},
|
45 |
-
"signal": {
|
46 |
-
"signal_name": "text_statistics"
|
47 |
-
}
|
48 |
-
}
|
49 |
-
}
|
50 |
-
}
|
51 |
-
}
|
52 |
-
},
|
53 |
-
"signal": {
|
54 |
-
"signal_name": "text_statistics"
|
55 |
-
},
|
56 |
-
"enriched_path": [
|
57 |
-
"question"
|
58 |
-
]
|
59 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|