Spaces:
Running
Running
Push
Browse files- .gitattributes +0 -18
- README.md +1 -1
- data/datasets/local/imdb/data-00000-of-00001.parquet +0 -3
- data/datasets/local/imdb/manifest.json +0 -21
- data/datasets/local/imdb/settings.json +0 -1
- data/datasets/local/imdb/text/gte-small/hnsw.hnswlib.bin +0 -3
- data/datasets/local/imdb/text/gte-small/hnsw.lookup.pkl +0 -3
- data/datasets/local/imdb/text/gte-small/signal_manifest.json +0 -35
- data/datasets/local/imdb/text/gte-small/spans.pkl +0 -3
- data/datasets/local/imdb/text/lang_detection(split_by_paragraph=True)/data-00000-of-00001.parquet +0 -3
- data/datasets/local/imdb/text/lang_detection(split_by_paragraph=True)/signal_manifest.json +0 -38
- data/datasets/local/imdb/text/near_dup/data-00000-of-00001.parquet +0 -3
- data/datasets/local/imdb/text/near_dup/signal_manifest.json +0 -36
- data/datasets/local/imdb/text/pii/data-00000-of-00001.parquet +0 -3
- data/datasets/local/imdb/text/pii/signal_manifest.json +0 -45
- data/datasets/local/imdb/text/spacy_ner/data-00000-of-00001.parquet +0 -3
- data/datasets/local/imdb/text/spacy_ner/signal_manifest.json +0 -38
- data/datasets/local/imdb/text/text_statistics/data-00000-of-00001.parquet +0 -3
- data/datasets/local/imdb/text/text_statistics/signal_manifest.json +0 -59
- data/datasets/local/open-asssistant-conversations/data-00000-of-00001.parquet +0 -3
- data/datasets/local/open-asssistant-conversations/manifest.json +0 -118
- data/datasets/local/open-asssistant-conversations/settings.json +0 -1
- data/datasets/local/open-asssistant-conversations/text/gte-small/hnsw.hnswlib.bin +0 -3
- data/datasets/local/open-asssistant-conversations/text/gte-small/hnsw.lookup.pkl +0 -3
- data/datasets/local/open-asssistant-conversations/text/gte-small/signal_manifest.json +0 -35
- data/datasets/local/open-asssistant-conversations/text/gte-small/spans.pkl +0 -3
- data/datasets/local/open-asssistant-conversations/text/lang_detection(split_by_paragraph=True)/data-00000-of-00001.parquet +0 -3
- data/datasets/local/open-asssistant-conversations/text/lang_detection(split_by_paragraph=True)/signal_manifest.json +0 -38
- data/datasets/local/open-asssistant-conversations/text/near_dup/data-00000-of-00001.parquet +0 -3
- data/datasets/local/open-asssistant-conversations/text/near_dup/signal_manifest.json +0 -36
- data/datasets/local/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet +0 -3
- data/datasets/local/open-asssistant-conversations/text/pii/signal_manifest.json +0 -45
- data/datasets/local/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet +0 -3
- data/datasets/local/open-asssistant-conversations/text/spacy_ner/signal_manifest.json +0 -38
- data/datasets/local/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet +0 -3
- data/datasets/local/open-asssistant-conversations/text/text_statistics/signal_manifest.json +0 -59
- lilac/concepts/db_concept.py +1 -0
.gitattributes
DELETED
@@ -1,18 +0,0 @@
|
|
1 |
-
data/datasets/local/open-asssistant-conversations/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
2 |
-
data/datasets/local/open-asssistant-conversations/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
3 |
-
data/datasets/local/open-asssistant-conversations/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
4 |
-
data/datasets/local/open-asssistant-conversations/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
5 |
-
data/datasets/local/open-asssistant-conversations/text/lang_detection(split_by_paragraph=True)/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
6 |
-
data/datasets/local/open-asssistant-conversations/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
7 |
-
data/datasets/local/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
8 |
-
data/datasets/local/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
9 |
-
data/datasets/local/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
10 |
-
data/datasets/local/imdb/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
11 |
-
data/datasets/local/imdb/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
12 |
-
data/datasets/local/imdb/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
13 |
-
data/datasets/local/imdb/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
14 |
-
data/datasets/local/imdb/text/lang_detection(split_by_paragraph=True)/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
15 |
-
data/datasets/local/imdb/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
16 |
-
data/datasets/local/imdb/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
17 |
-
data/datasets/local/imdb/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
18 |
-
data/datasets/local/imdb/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title: Lilac
|
3 |
emoji: 🌷
|
4 |
colorFrom: purple
|
5 |
colorTo: purple
|
|
|
1 |
---
|
2 |
+
title: Lilac
|
3 |
emoji: 🌷
|
4 |
colorFrom: purple
|
5 |
colorTo: purple
|
data/datasets/local/imdb/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:5cf3f121bae8b8d8c12af8bebe4cda35c2a84750470fff57ea37a4930c257d6f
|
3 |
-
size 86160733
|
|
|
|
|
|
|
|
data/datasets/local/imdb/manifest.json
DELETED
@@ -1,21 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"data_schema": {
|
6 |
-
"fields": {
|
7 |
-
"text": {
|
8 |
-
"dtype": "string"
|
9 |
-
},
|
10 |
-
"label": {
|
11 |
-
"dtype": "string"
|
12 |
-
},
|
13 |
-
"__hfsplit__": {
|
14 |
-
"dtype": "string"
|
15 |
-
},
|
16 |
-
"__rowid__": {
|
17 |
-
"dtype": "string"
|
18 |
-
}
|
19 |
-
}
|
20 |
-
}
|
21 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/local/imdb/settings.json
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
{"ui": {"media_paths": [["text"]], "markdown_paths": []}, "preferred_embedding": "gte-small"}
|
|
|
|
data/datasets/local/imdb/text/gte-small/hnsw.hnswlib.bin
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:4659a623093a2ef1646885a6ecb6ef86c56c2dcd0b10900d7b46d193dfb69e7f
|
3 |
-
size 691432464
|
|
|
|
|
|
|
|
data/datasets/local/imdb/text/gte-small/hnsw.lookup.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:7cbf4a5777b0cd1f8bb5061a6177b27cc0f5a8a6349c487c0c5c52fe60697d64
|
3 |
-
size 10390846
|
|
|
|
|
|
|
|
data/datasets/local/imdb/text/gte-small/signal_manifest.json
DELETED
@@ -1,35 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [],
|
3 |
-
"parquet_id": "gte-small(text)",
|
4 |
-
"data_schema": {
|
5 |
-
"fields": {
|
6 |
-
"__rowid__": {
|
7 |
-
"dtype": "string"
|
8 |
-
},
|
9 |
-
"text": {
|
10 |
-
"fields": {
|
11 |
-
"gte-small": {
|
12 |
-
"repeated_field": {
|
13 |
-
"fields": {
|
14 |
-
"embedding": {
|
15 |
-
"dtype": "embedding"
|
16 |
-
}
|
17 |
-
},
|
18 |
-
"dtype": "string_span"
|
19 |
-
},
|
20 |
-
"signal": {
|
21 |
-
"signal_name": "gte-small"
|
22 |
-
}
|
23 |
-
}
|
24 |
-
}
|
25 |
-
}
|
26 |
-
}
|
27 |
-
},
|
28 |
-
"signal": {
|
29 |
-
"signal_name": "gte-small"
|
30 |
-
},
|
31 |
-
"enriched_path": [
|
32 |
-
"text"
|
33 |
-
],
|
34 |
-
"vector_store": "hnsw"
|
35 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/local/imdb/text/gte-small/spans.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:526e8505beb2386e3ff30367968685fd2229f76af2c0c86d50afaa7da3018dbc
|
3 |
-
size 7476546
|
|
|
|
|
|
|
|
data/datasets/local/imdb/text/lang_detection(split_by_paragraph=True)/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:3dd9ee881d6bf4fa2bb3a6db647d0c6d1f648b4a80b4e6d1aa081032bfddf5bc
|
3 |
-
size 3495763
|
|
|
|
|
|
|
|
data/datasets/local/imdb/text/lang_detection(split_by_paragraph=True)/signal_manifest.json
DELETED
@@ -1,38 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "lang_detection(split_by_paragraph=True)(text)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"text": {
|
12 |
-
"fields": {
|
13 |
-
"lang_detection(split_by_paragraph=True)": {
|
14 |
-
"repeated_field": {
|
15 |
-
"fields": {
|
16 |
-
"lang_code": {
|
17 |
-
"dtype": "string"
|
18 |
-
}
|
19 |
-
},
|
20 |
-
"dtype": "string_span"
|
21 |
-
},
|
22 |
-
"signal": {
|
23 |
-
"split_by_paragraph": true,
|
24 |
-
"signal_name": "lang_detection"
|
25 |
-
}
|
26 |
-
}
|
27 |
-
}
|
28 |
-
}
|
29 |
-
}
|
30 |
-
},
|
31 |
-
"signal": {
|
32 |
-
"split_by_paragraph": true,
|
33 |
-
"signal_name": "lang_detection"
|
34 |
-
},
|
35 |
-
"enriched_path": [
|
36 |
-
"text"
|
37 |
-
]
|
38 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/local/imdb/text/near_dup/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:a1ccd12fc66d0c31a19554fcb5f442751807745e51c3a9336cec637525a422fc
|
3 |
-
size 3916036
|
|
|
|
|
|
|
|
data/datasets/local/imdb/text/near_dup/signal_manifest.json
DELETED
@@ -1,36 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "near_dup(text)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"text": {
|
12 |
-
"fields": {
|
13 |
-
"near_dup": {
|
14 |
-
"fields": {
|
15 |
-
"cluster_id": {
|
16 |
-
"dtype": "uint32",
|
17 |
-
"categorical": true
|
18 |
-
}
|
19 |
-
},
|
20 |
-
"signal": {
|
21 |
-
"threshold": 0.75,
|
22 |
-
"signal_name": "near_dup"
|
23 |
-
}
|
24 |
-
}
|
25 |
-
}
|
26 |
-
}
|
27 |
-
}
|
28 |
-
},
|
29 |
-
"signal": {
|
30 |
-
"threshold": 0.75,
|
31 |
-
"signal_name": "near_dup"
|
32 |
-
},
|
33 |
-
"enriched_path": [
|
34 |
-
"text"
|
35 |
-
]
|
36 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/local/imdb/text/pii/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:b4f1f559281ca4e3efcafd4b10c51cbe2f5039d86ce95d3dc07156671fd8b824
|
3 |
-
size 3313984
|
|
|
|
|
|
|
|
data/datasets/local/imdb/text/pii/signal_manifest.json
DELETED
@@ -1,45 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "pii(text)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"text": {
|
12 |
-
"fields": {
|
13 |
-
"pii": {
|
14 |
-
"fields": {
|
15 |
-
"emails": {
|
16 |
-
"repeated_field": {
|
17 |
-
"dtype": "string_span"
|
18 |
-
}
|
19 |
-
},
|
20 |
-
"ip_addresses": {
|
21 |
-
"repeated_field": {
|
22 |
-
"dtype": "string_span"
|
23 |
-
}
|
24 |
-
},
|
25 |
-
"secrets": {
|
26 |
-
"repeated_field": {
|
27 |
-
"dtype": "string_span"
|
28 |
-
}
|
29 |
-
}
|
30 |
-
},
|
31 |
-
"signal": {
|
32 |
-
"signal_name": "pii"
|
33 |
-
}
|
34 |
-
}
|
35 |
-
}
|
36 |
-
}
|
37 |
-
}
|
38 |
-
},
|
39 |
-
"signal": {
|
40 |
-
"signal_name": "pii"
|
41 |
-
},
|
42 |
-
"enriched_path": [
|
43 |
-
"text"
|
44 |
-
]
|
45 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/local/imdb/text/spacy_ner/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:19ce0e0966a4db29b7b862aa3fa87ef3b02997e57efcdd722023819caa1be7bb
|
3 |
-
size 8483750
|
|
|
|
|
|
|
|
data/datasets/local/imdb/text/spacy_ner/signal_manifest.json
DELETED
@@ -1,38 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "spacy_ner(text)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"text": {
|
12 |
-
"fields": {
|
13 |
-
"spacy_ner": {
|
14 |
-
"repeated_field": {
|
15 |
-
"fields": {
|
16 |
-
"label": {
|
17 |
-
"dtype": "string"
|
18 |
-
}
|
19 |
-
},
|
20 |
-
"dtype": "string_span"
|
21 |
-
},
|
22 |
-
"signal": {
|
23 |
-
"model": "en_core_web_sm",
|
24 |
-
"signal_name": "spacy_ner"
|
25 |
-
}
|
26 |
-
}
|
27 |
-
}
|
28 |
-
}
|
29 |
-
}
|
30 |
-
},
|
31 |
-
"signal": {
|
32 |
-
"model": "en_core_web_sm",
|
33 |
-
"signal_name": "spacy_ner"
|
34 |
-
},
|
35 |
-
"enriched_path": [
|
36 |
-
"text"
|
37 |
-
]
|
38 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/local/imdb/text/text_statistics/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:672357a255fecf4e29604674ff3ceb11b6772d0388293f5267f608a6163faf49
|
3 |
-
size 4404092
|
|
|
|
|
|
|
|
data/datasets/local/imdb/text/text_statistics/signal_manifest.json
DELETED
@@ -1,59 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "text_statistics(text)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"text": {
|
12 |
-
"fields": {
|
13 |
-
"text_statistics": {
|
14 |
-
"fields": {
|
15 |
-
"num_characters": {
|
16 |
-
"dtype": "int32"
|
17 |
-
},
|
18 |
-
"readability": {
|
19 |
-
"dtype": "float32"
|
20 |
-
},
|
21 |
-
"log(type_token_ratio)": {
|
22 |
-
"dtype": "float32"
|
23 |
-
},
|
24 |
-
"frac_non_ascii": {
|
25 |
-
"dtype": "float32",
|
26 |
-
"bins": [
|
27 |
-
[
|
28 |
-
"Low",
|
29 |
-
null,
|
30 |
-
0.15
|
31 |
-
],
|
32 |
-
[
|
33 |
-
"Medium",
|
34 |
-
0.15,
|
35 |
-
0.3
|
36 |
-
],
|
37 |
-
[
|
38 |
-
"High",
|
39 |
-
0.3,
|
40 |
-
null
|
41 |
-
]
|
42 |
-
]
|
43 |
-
}
|
44 |
-
},
|
45 |
-
"signal": {
|
46 |
-
"signal_name": "text_statistics"
|
47 |
-
}
|
48 |
-
}
|
49 |
-
}
|
50 |
-
}
|
51 |
-
}
|
52 |
-
},
|
53 |
-
"signal": {
|
54 |
-
"signal_name": "text_statistics"
|
55 |
-
},
|
56 |
-
"enriched_path": [
|
57 |
-
"text"
|
58 |
-
]
|
59 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/local/open-asssistant-conversations/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:2557dc647ff10b0396e9b40f24468f599661c664ff777c62647605503dea94dc
|
3 |
-
size 42071787
|
|
|
|
|
|
|
|
data/datasets/local/open-asssistant-conversations/manifest.json
DELETED
@@ -1,118 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"data_schema": {
|
6 |
-
"fields": {
|
7 |
-
"message_id": {
|
8 |
-
"dtype": "string"
|
9 |
-
},
|
10 |
-
"parent_id": {
|
11 |
-
"dtype": "string"
|
12 |
-
},
|
13 |
-
"user_id": {
|
14 |
-
"dtype": "string"
|
15 |
-
},
|
16 |
-
"created_date": {
|
17 |
-
"dtype": "string"
|
18 |
-
},
|
19 |
-
"text": {
|
20 |
-
"dtype": "string"
|
21 |
-
},
|
22 |
-
"role": {
|
23 |
-
"dtype": "string"
|
24 |
-
},
|
25 |
-
"lang": {
|
26 |
-
"dtype": "string"
|
27 |
-
},
|
28 |
-
"review_count": {
|
29 |
-
"dtype": "int32"
|
30 |
-
},
|
31 |
-
"review_result": {
|
32 |
-
"dtype": "boolean"
|
33 |
-
},
|
34 |
-
"deleted": {
|
35 |
-
"dtype": "boolean"
|
36 |
-
},
|
37 |
-
"rank": {
|
38 |
-
"dtype": "int32"
|
39 |
-
},
|
40 |
-
"synthetic": {
|
41 |
-
"dtype": "boolean"
|
42 |
-
},
|
43 |
-
"model_name": {
|
44 |
-
"dtype": "string"
|
45 |
-
},
|
46 |
-
"detoxify": {
|
47 |
-
"fields": {
|
48 |
-
"toxicity": {
|
49 |
-
"dtype": "float64"
|
50 |
-
},
|
51 |
-
"severe_toxicity": {
|
52 |
-
"dtype": "float64"
|
53 |
-
},
|
54 |
-
"obscene": {
|
55 |
-
"dtype": "float64"
|
56 |
-
},
|
57 |
-
"identity_attack": {
|
58 |
-
"dtype": "float64"
|
59 |
-
},
|
60 |
-
"insult": {
|
61 |
-
"dtype": "float64"
|
62 |
-
},
|
63 |
-
"threat": {
|
64 |
-
"dtype": "float64"
|
65 |
-
},
|
66 |
-
"sexual_explicit": {
|
67 |
-
"dtype": "float64"
|
68 |
-
}
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"message_tree_id": {
|
72 |
-
"dtype": "string"
|
73 |
-
},
|
74 |
-
"tree_state": {
|
75 |
-
"dtype": "string"
|
76 |
-
},
|
77 |
-
"emojis": {
|
78 |
-
"fields": {
|
79 |
-
"name": {
|
80 |
-
"repeated_field": {
|
81 |
-
"dtype": "string"
|
82 |
-
}
|
83 |
-
},
|
84 |
-
"count": {
|
85 |
-
"repeated_field": {
|
86 |
-
"dtype": "int32"
|
87 |
-
}
|
88 |
-
}
|
89 |
-
}
|
90 |
-
},
|
91 |
-
"labels": {
|
92 |
-
"fields": {
|
93 |
-
"name": {
|
94 |
-
"repeated_field": {
|
95 |
-
"dtype": "string"
|
96 |
-
}
|
97 |
-
},
|
98 |
-
"value": {
|
99 |
-
"repeated_field": {
|
100 |
-
"dtype": "float64"
|
101 |
-
}
|
102 |
-
},
|
103 |
-
"count": {
|
104 |
-
"repeated_field": {
|
105 |
-
"dtype": "int32"
|
106 |
-
}
|
107 |
-
}
|
108 |
-
}
|
109 |
-
},
|
110 |
-
"__hfsplit__": {
|
111 |
-
"dtype": "string"
|
112 |
-
},
|
113 |
-
"__rowid__": {
|
114 |
-
"dtype": "string"
|
115 |
-
}
|
116 |
-
}
|
117 |
-
}
|
118 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/local/open-asssistant-conversations/settings.json
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
{"ui": {"media_paths": [["text"]], "markdown_paths": []}, "preferred_embedding": "gte-small"}
|
|
|
|
data/datasets/local/open-asssistant-conversations/text/gte-small/hnsw.hnswlib.bin
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:1ef42015e1cfa76fc929c0a8913911c765e871586af7eac6f42def6abbd856f5
|
3 |
-
size 327991004
|
|
|
|
|
|
|
|
data/datasets/local/open-asssistant-conversations/text/gte-small/hnsw.lookup.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:4370d3885f9dea3df44fc7e366069c83c93af3b068ed5a56eaa2ac442c4f502c
|
3 |
-
size 6171229
|
|
|
|
|
|
|
|
data/datasets/local/open-asssistant-conversations/text/gte-small/signal_manifest.json
DELETED
@@ -1,35 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [],
|
3 |
-
"parquet_id": "gte-small(text)",
|
4 |
-
"data_schema": {
|
5 |
-
"fields": {
|
6 |
-
"__rowid__": {
|
7 |
-
"dtype": "string"
|
8 |
-
},
|
9 |
-
"text": {
|
10 |
-
"fields": {
|
11 |
-
"gte-small": {
|
12 |
-
"repeated_field": {
|
13 |
-
"fields": {
|
14 |
-
"embedding": {
|
15 |
-
"dtype": "embedding"
|
16 |
-
}
|
17 |
-
},
|
18 |
-
"dtype": "string_span"
|
19 |
-
},
|
20 |
-
"signal": {
|
21 |
-
"signal_name": "gte-small"
|
22 |
-
}
|
23 |
-
}
|
24 |
-
}
|
25 |
-
}
|
26 |
-
}
|
27 |
-
},
|
28 |
-
"signal": {
|
29 |
-
"signal_name": "gte-small"
|
30 |
-
},
|
31 |
-
"enriched_path": [
|
32 |
-
"text"
|
33 |
-
],
|
34 |
-
"vector_store": "hnsw"
|
35 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/local/open-asssistant-conversations/text/gte-small/spans.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:e9b6962f47eefe550b314cdb4d6c6eb0811670f5f41d137b952fcc55e1d331cc
|
3 |
-
size 5164058
|
|
|
|
|
|
|
|
data/datasets/local/open-asssistant-conversations/text/lang_detection(split_by_paragraph=True)/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:17cf7d5bd4e5b74dbe3024da1e4115c013b65626901916a0aa471e79ba88d1b1
|
3 |
-
size 3765373
|
|
|
|
|
|
|
|
data/datasets/local/open-asssistant-conversations/text/lang_detection(split_by_paragraph=True)/signal_manifest.json
DELETED
@@ -1,38 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "lang_detection(split_by_paragraph=True)(text)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"text": {
|
12 |
-
"fields": {
|
13 |
-
"lang_detection(split_by_paragraph=True)": {
|
14 |
-
"repeated_field": {
|
15 |
-
"fields": {
|
16 |
-
"lang_code": {
|
17 |
-
"dtype": "string"
|
18 |
-
}
|
19 |
-
},
|
20 |
-
"dtype": "string_span"
|
21 |
-
},
|
22 |
-
"signal": {
|
23 |
-
"split_by_paragraph": true,
|
24 |
-
"signal_name": "lang_detection"
|
25 |
-
}
|
26 |
-
}
|
27 |
-
}
|
28 |
-
}
|
29 |
-
}
|
30 |
-
},
|
31 |
-
"signal": {
|
32 |
-
"split_by_paragraph": true,
|
33 |
-
"signal_name": "lang_detection"
|
34 |
-
},
|
35 |
-
"enriched_path": [
|
36 |
-
"text"
|
37 |
-
]
|
38 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/local/open-asssistant-conversations/text/near_dup/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:43c99611fc94cdd4998e03f18f651fe2ea7b515a5780bbcb78baa2030a3b39b1
|
3 |
-
size 3485154
|
|
|
|
|
|
|
|
data/datasets/local/open-asssistant-conversations/text/near_dup/signal_manifest.json
DELETED
@@ -1,36 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "near_dup(text)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"text": {
|
12 |
-
"fields": {
|
13 |
-
"near_dup": {
|
14 |
-
"fields": {
|
15 |
-
"cluster_id": {
|
16 |
-
"dtype": "uint32",
|
17 |
-
"categorical": true
|
18 |
-
}
|
19 |
-
},
|
20 |
-
"signal": {
|
21 |
-
"threshold": 0.75,
|
22 |
-
"signal_name": "near_dup"
|
23 |
-
}
|
24 |
-
}
|
25 |
-
}
|
26 |
-
}
|
27 |
-
}
|
28 |
-
},
|
29 |
-
"signal": {
|
30 |
-
"threshold": 0.75,
|
31 |
-
"signal_name": "near_dup"
|
32 |
-
},
|
33 |
-
"enriched_path": [
|
34 |
-
"text"
|
35 |
-
]
|
36 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/local/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:1cc6966d1c3c262121fa6130ff54e4ba7431d89ae81dfbc9ef9025f31bf095be
|
3 |
-
size 2953280
|
|
|
|
|
|
|
|
data/datasets/local/open-asssistant-conversations/text/pii/signal_manifest.json
DELETED
@@ -1,45 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "pii(text)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"text": {
|
12 |
-
"fields": {
|
13 |
-
"pii": {
|
14 |
-
"fields": {
|
15 |
-
"emails": {
|
16 |
-
"repeated_field": {
|
17 |
-
"dtype": "string_span"
|
18 |
-
}
|
19 |
-
},
|
20 |
-
"ip_addresses": {
|
21 |
-
"repeated_field": {
|
22 |
-
"dtype": "string_span"
|
23 |
-
}
|
24 |
-
},
|
25 |
-
"secrets": {
|
26 |
-
"repeated_field": {
|
27 |
-
"dtype": "string_span"
|
28 |
-
}
|
29 |
-
}
|
30 |
-
},
|
31 |
-
"signal": {
|
32 |
-
"signal_name": "pii"
|
33 |
-
}
|
34 |
-
}
|
35 |
-
}
|
36 |
-
}
|
37 |
-
}
|
38 |
-
},
|
39 |
-
"signal": {
|
40 |
-
"signal_name": "pii"
|
41 |
-
},
|
42 |
-
"enriched_path": [
|
43 |
-
"text"
|
44 |
-
]
|
45 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/local/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:971edbdc4bdfad768444691a2e276f3c69e99a9f5251168aaa7fd2a89a649043
|
3 |
-
size 5955494
|
|
|
|
|
|
|
|
data/datasets/local/open-asssistant-conversations/text/spacy_ner/signal_manifest.json
DELETED
@@ -1,38 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "spacy_ner(text)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"text": {
|
12 |
-
"fields": {
|
13 |
-
"spacy_ner": {
|
14 |
-
"repeated_field": {
|
15 |
-
"fields": {
|
16 |
-
"label": {
|
17 |
-
"dtype": "string"
|
18 |
-
}
|
19 |
-
},
|
20 |
-
"dtype": "string_span"
|
21 |
-
},
|
22 |
-
"signal": {
|
23 |
-
"model": "en_core_web_sm",
|
24 |
-
"signal_name": "spacy_ner"
|
25 |
-
}
|
26 |
-
}
|
27 |
-
}
|
28 |
-
}
|
29 |
-
}
|
30 |
-
},
|
31 |
-
"signal": {
|
32 |
-
"model": "en_core_web_sm",
|
33 |
-
"signal_name": "spacy_ner"
|
34 |
-
},
|
35 |
-
"enriched_path": [
|
36 |
-
"text"
|
37 |
-
]
|
38 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/datasets/local/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:a1adb0c31662191820bcffbefd09c00ecdc101bbc41b9941179ff0b4fd78d11b
|
3 |
-
size 3827236
|
|
|
|
|
|
|
|
data/datasets/local/open-asssistant-conversations/text/text_statistics/signal_manifest.json
DELETED
@@ -1,59 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"files": [
|
3 |
-
"data-00000-of-00001.parquet"
|
4 |
-
],
|
5 |
-
"parquet_id": "text_statistics(text)",
|
6 |
-
"data_schema": {
|
7 |
-
"fields": {
|
8 |
-
"__rowid__": {
|
9 |
-
"dtype": "string"
|
10 |
-
},
|
11 |
-
"text": {
|
12 |
-
"fields": {
|
13 |
-
"text_statistics": {
|
14 |
-
"fields": {
|
15 |
-
"num_characters": {
|
16 |
-
"dtype": "int32"
|
17 |
-
},
|
18 |
-
"readability": {
|
19 |
-
"dtype": "float32"
|
20 |
-
},
|
21 |
-
"log(type_token_ratio)": {
|
22 |
-
"dtype": "float32"
|
23 |
-
},
|
24 |
-
"frac_non_ascii": {
|
25 |
-
"dtype": "float32",
|
26 |
-
"bins": [
|
27 |
-
[
|
28 |
-
"Low",
|
29 |
-
null,
|
30 |
-
0.15
|
31 |
-
],
|
32 |
-
[
|
33 |
-
"Medium",
|
34 |
-
0.15,
|
35 |
-
0.3
|
36 |
-
],
|
37 |
-
[
|
38 |
-
"High",
|
39 |
-
0.3,
|
40 |
-
null
|
41 |
-
]
|
42 |
-
]
|
43 |
-
}
|
44 |
-
},
|
45 |
-
"signal": {
|
46 |
-
"signal_name": "text_statistics"
|
47 |
-
}
|
48 |
-
}
|
49 |
-
}
|
50 |
-
}
|
51 |
-
}
|
52 |
-
},
|
53 |
-
"signal": {
|
54 |
-
"signal_name": "text_statistics"
|
55 |
-
},
|
56 |
-
"enriched_path": [
|
57 |
-
"text"
|
58 |
-
]
|
59 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
lilac/concepts/db_concept.py
CHANGED
@@ -366,6 +366,7 @@ class DiskConceptDB(ConceptDB):
|
|
366 |
f'Concept "{namespace}/{name}" does not exist or user does not have access.')
|
367 |
|
368 |
concept_json_path = _concept_json_path(self._get_base_dir(), namespace, name)
|
|
|
369 |
if not file_exists(concept_json_path):
|
370 |
return None
|
371 |
|
|
|
366 |
f'Concept "{namespace}/{name}" does not exist or user does not have access.')
|
367 |
|
368 |
concept_json_path = _concept_json_path(self._get_base_dir(), namespace, name)
|
369 |
+
print('json path=', concept_json_path)
|
370 |
if not file_exists(concept_json_path):
|
371 |
return None
|
372 |
|