Shankhdhar
committed on
Commit
•
2ec76c9
1
Parent(s):
5944e36
Some changes
Browse files- .gitattributes +2 -0
- text_collection/text_collection.py +0 -104
- text_collection/text_collection.py.lock +0 -0
- train/train2.txt +0 -3
- train/val2.txt +0 -0
- val/val2.txt +0 -0
.gitattributes
CHANGED
@@ -15,6 +15,8 @@
|
|
15 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
16 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
17 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
18 |
Train.tsv filter=lfs diff=lfs merge=lfs -text
|
19 |
train/train2.tsv filter=lfs diff=lfs merge=lfs -text
|
20 |
train/train2.txt filter=lfs diff=lfs merge=lfs -text
|
|
|
15 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
16 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
17 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.tsv filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.txt filter=lfs diff=lfs merge=lfs -text
|
20 |
Train.tsv filter=lfs diff=lfs merge=lfs -text
|
21 |
train/train2.tsv filter=lfs diff=lfs merge=lfs -text
|
22 |
train/train2.txt filter=lfs diff=lfs merge=lfs -text
|
text_collection/text_collection.py
DELETED
@@ -1,104 +0,0 @@
|
|
1 |
-
# coding=utf-8
|
2 |
-
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
|
3 |
-
#
|
4 |
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
-
# you may not use this file except in compliance with the License.
|
6 |
-
# You may obtain a copy of the License at
|
7 |
-
#
|
8 |
-
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
-
#
|
10 |
-
# Unless required by applicable law or agreed to in writing, software
|
11 |
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
-
# See the License for the specific language governing permissions and
|
14 |
-
# limitations under the License.
|
15 |
-
"""Large-scale Indonesian Summarization Dataset"""
|
16 |
-
import glob
|
17 |
-
import json
|
18 |
-
import os
|
19 |
-
import re
|
20 |
-
from pathlib import Path
|
21 |
-
import datasets
|
22 |
-
logger = datasets.logging.get_logger(__name__)
|
23 |
-
_CITATION = """\
|
24 |
-
"""
|
25 |
-
_DESCRIPTION = """\
|
26 |
-
This module load text dataset from local directory. The text dataset should have the format like Oscar dataset
|
27 |
-
where each new entry is separated by empty lines.
|
28 |
-
"""
|
29 |
-
_HOMEPAGE = ""
|
30 |
-
_LICENSE = ""
|
31 |
-
class TextCollectionConfig(datasets.BuilderConfig):
|
32 |
-
"""BuilderConfig for TextCollection"""
|
33 |
-
def __init__(self, **kwargs):
|
34 |
-
"""BuilderConfig for TextCollection.
|
35 |
-
Args:
|
36 |
-
**kwargs: keyword arguments forwarded to super.
|
37 |
-
"""
|
38 |
-
super(TextCollectionConfig, self).__init__(**kwargs)
|
39 |
-
class TextCollection(datasets.GeneratorBasedBuilder):
|
40 |
-
VERSION = datasets.Version("1.0.0")
|
41 |
-
BUILDER_CONFIGS = [
|
42 |
-
TextCollectionConfig(
|
43 |
-
name="text_collection",
|
44 |
-
version=VERSION,
|
45 |
-
description="Id Collection dataset",
|
46 |
-
),
|
47 |
-
]
|
48 |
-
@property
|
49 |
-
def manual_download_instructions(self):
|
50 |
-
return """\
|
51 |
-
You need to manually collect text datasets in a directory. The text dataset can then be loaded
|
52 |
-
using the following command:
|
53 |
-
`datasets.load_dataset("text_collection", data_dir="<path/to/dataset>")`.
|
54 |
-
"""
|
55 |
-
def _info(self):
|
56 |
-
return datasets.DatasetInfo(
|
57 |
-
description=_DESCRIPTION,
|
58 |
-
features=datasets.Features({"id": datasets.Value("int64"), "text": datasets.Value("string")}),
|
59 |
-
supervised_keys=None,
|
60 |
-
homepage=_HOMEPAGE,
|
61 |
-
license=_LICENSE,
|
62 |
-
citation=_CITATION,
|
63 |
-
)
|
64 |
-
def _split_generators(self, dl_manager):
|
65 |
-
data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
|
66 |
-
print("# Data directory", data_dir)
|
67 |
-
if not os.path.exists(data_dir):
|
68 |
-
raise FileNotFoundError(
|
69 |
-
"{} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('id_liputan6', "
|
70 |
-
"'canonical', data_dir=...)`. Manual download instructions:\n{}".format(
|
71 |
-
data_dir, self.manual_download_instructions
|
72 |
-
)
|
73 |
-
)
|
74 |
-
split_generators = [
|
75 |
-
datasets.SplitGenerator(
|
76 |
-
name=datasets.Split.TRAIN,
|
77 |
-
gen_kwargs={
|
78 |
-
"article_dir": os.path.join(data_dir, ""),
|
79 |
-
"split": "train",
|
80 |
-
},
|
81 |
-
)
|
82 |
-
]
|
83 |
-
return split_generators
|
84 |
-
def _generate_examples(self, article_dir, split):
|
85 |
-
logger.info("⏳ Generating %s examples from = %s", split, article_dir)
|
86 |
-
id_ = 0
|
87 |
-
current_lines = []
|
88 |
-
for path in sorted(glob.glob(os.path.join(article_dir, "**/*.txt"), recursive=True)):
|
89 |
-
with open(path, "r") as f:
|
90 |
-
print("# Reading", path)
|
91 |
-
for line in f:
|
92 |
-
if len(line.strip()) > 0:
|
93 |
-
current_lines.append(line)
|
94 |
-
elif current_lines:
|
95 |
-
feature = id_, {"id": id_, "text": "".join(current_lines).rstrip()}
|
96 |
-
yield feature
|
97 |
-
id_ += 1
|
98 |
-
current_lines = []
|
99 |
-
# last paragraph
|
100 |
-
if current_lines:
|
101 |
-
feature = id_, {"id": id_, "text": "".join(current_lines).rstrip()}
|
102 |
-
yield feature
|
103 |
-
id_ += 1
|
104 |
-
current_lines = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
text_collection/text_collection.py.lock
DELETED
File without changes
|
train/train2.txt
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:5874cf342a7153d6949d6ecdbe72cabfde2bede960a4409bd7f82e88d6d4ed0f
|
3 |
-
size 19715139
|
|
|
|
|
|
|
|
train/val2.txt
DELETED
The diff for this file is too large to render.
See raw diff
|
|
val/val2.txt
DELETED
The diff for this file is too large to render.
See raw diff
|
|