Some changes

Browse files

Files changed (6) hide show

.gitattributes +2 -0
text_collection/text_collection.py +0 -104
text_collection/text_collection.py.lock +0 -0
train/train2.txt +0 -3
train/val2.txt +0 -0
val/val2.txt +0 -0

.gitattributes CHANGED Viewed

@@ -15,6 +15,8 @@
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 Train.tsv filter=lfs diff=lfs merge=lfs -text
 train/train2.tsv filter=lfs diff=lfs merge=lfs -text
 train/train2.txt filter=lfs diff=lfs merge=lfs -text

 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.tsv filter=lfs diff=lfs merge=lfs -text
+*.txt filter=lfs diff=lfs merge=lfs -text
 Train.tsv filter=lfs diff=lfs merge=lfs -text
 train/train2.tsv filter=lfs diff=lfs merge=lfs -text
 train/train2.txt filter=lfs diff=lfs merge=lfs -text

text_collection/text_collection.py DELETED Viewed

@@ -1,104 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Large-scale Indonesian Summarization Dataset"""
-import glob
-import json
-import os
-import re
-from pathlib import Path
-import datasets
-logger = datasets.logging.get_logger(__name__)
-_CITATION = """\
-"""
-_DESCRIPTION = """\
-This module load text dataset from local directory. The text dataset should have the format like Oscar dataset
-where each new entry is separated by empty lines.
-"""
-_HOMEPAGE = ""
-_LICENSE = ""
-class TextCollectionConfig(datasets.BuilderConfig):
-    """BuilderConfig for TextCollection"""
-    def __init__(self, **kwargs):
-        """BuilderConfig for TextCollection.
-        Args:
-          **kwargs: keyword arguments forwarded to super.
-        """
-        super(TextCollectionConfig, self).__init__(**kwargs)
-class TextCollection(datasets.GeneratorBasedBuilder):
-    VERSION = datasets.Version("1.0.0")
-    BUILDER_CONFIGS = [
-        TextCollectionConfig(
-            name="text_collection",
-            version=VERSION,
-            description="Id Collection dataset",
-        ),
-    ]
-    @property
-    def manual_download_instructions(self):
-        return """\
-            You need to manually collect text datasets in a directory.  The text dataset can then be loaded
-            using the following command:
-            `datasets.load_dataset("text_collection", data_dir="<path/to/dataset>")`.
-            """
-    def _info(self):
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=datasets.Features({"id": datasets.Value("int64"), "text": datasets.Value("string")}),
-            supervised_keys=None,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-    def _split_generators(self, dl_manager):
-        data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
-        print("# Data directory", data_dir)
-        if not os.path.exists(data_dir):
-            raise FileNotFoundError(
-                "{} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('id_liputan6', "
-                "'canonical', data_dir=...)`. Manual download instructions:\n{}".format(
-                    data_dir, self.manual_download_instructions
-                )
-            )
-        split_generators = [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "article_dir": os.path.join(data_dir, ""),
-                    "split": "train",
-                },
-            )
-        ]
-        return split_generators
-    def _generate_examples(self, article_dir, split):
-        logger.info("⏳ Generating %s examples from = %s", split, article_dir)
-        id_ = 0
-        current_lines = []
-        for path in sorted(glob.glob(os.path.join(article_dir, "**/*.txt"), recursive=True)):
-            with open(path, "r") as f:
-                print("# Reading", path)
-                for line in f:
-                    if len(line.strip()) > 0:
-                        current_lines.append(line)
-                    elif current_lines:
-                        feature = id_, {"id": id_, "text": "".join(current_lines).rstrip()}
-                        yield feature
-                        id_ += 1
-                        current_lines = []
-                # last paragraph
-                if current_lines:
-                    feature = id_, {"id": id_, "text": "".join(current_lines).rstrip()}
-                    yield feature
-                    id_ += 1
-                    current_lines = []

text_collection/text_collection.py.lock DELETED Viewed

File without changes

train/train2.txt DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5874cf342a7153d6949d6ecdbe72cabfde2bede960a4409bd7f82e88d6d4ed0f
-size 19715139

train/val2.txt DELETED Viewed

The diff for this file is too large to render. See raw diff

val/val2.txt DELETED Viewed

The diff for this file is too large to render. See raw diff