LoCo / dataset /tsv.py
Pusheen's picture
Upload 139 files
281df87 verified
raw
history blame
7.29 kB
import os
import os.path as op
import gc
import json
from typing import List
import logging
try:
from .blob_storage import BlobStorage, disk_usage
except:
class BlobStorage:
pass
def generate_lineidx(filein: str, idxout: str) -> None:
idxout_tmp = idxout + '.tmp'
with open(filein, 'r') as tsvin, open(idxout_tmp, 'w') as tsvout:
fsize = os.fstat(tsvin.fileno()).st_size
fpos = 0
while fpos != fsize:
tsvout.write(str(fpos) + "\n")
tsvin.readline()
fpos = tsvin.tell()
os.rename(idxout_tmp, idxout)
def read_to_character(fp, c):
result = []
while True:
s = fp.read(32)
assert s != ''
if c in s:
result.append(s[: s.index(c)])
break
else:
result.append(s)
return ''.join(result)
class TSVFile(object):
def __init__(self,
tsv_file: str,
if_generate_lineidx: bool = False,
lineidx: str = None,
class_selector: List[str] = None,
blob_storage: BlobStorage = None):
self.tsv_file = tsv_file
self.lineidx = op.splitext(tsv_file)[0] + '.lineidx' \
if not lineidx else lineidx
self.linelist = op.splitext(tsv_file)[0] + '.linelist'
self.chunks = op.splitext(tsv_file)[0] + '.chunks'
self._fp = None
self._lineidx = None
self._sample_indices = None
self._class_boundaries = None
self._class_selector = class_selector
self._blob_storage = blob_storage
self._len = None
# the process always keeps the process which opens the file.
# If the pid is not equal to the currrent pid, we will re-open the file.
self.pid = None
# generate lineidx if not exist
if not op.isfile(self.lineidx) and if_generate_lineidx:
generate_lineidx(self.tsv_file, self.lineidx)
def __del__(self):
self.gcidx()
if self._fp:
self._fp.close()
# physically remove the tsv file if it is retrieved by BlobStorage
if self._blob_storage and 'azcopy' in self.tsv_file and os.path.exists(self.tsv_file):
try:
original_usage = disk_usage('/')
os.remove(self.tsv_file)
logging.info("Purged %s (disk usage: %.2f%% => %.2f%%)" %
(self.tsv_file, original_usage, disk_usage('/') * 100))
except:
# Known issue: multiple threads attempting to delete the file will raise a FileNotFound error.
# TODO: try Threadling.Lock to better handle the race condition
pass
def __str__(self):
return "TSVFile(tsv_file='{}')".format(self.tsv_file)
def __repr__(self):
return str(self)
def gcidx(self):
logging.debug('Run gc collect')
self._lineidx = None
self._sample_indices = None
#self._class_boundaries = None
return gc.collect()
def get_class_boundaries(self):
return self._class_boundaries
def num_rows(self, gcf=False):
if (self._len is None):
self._ensure_lineidx_loaded()
retval = len(self._sample_indices)
if (gcf):
self.gcidx()
self._len = retval
return self._len
def seek(self, idx: int):
self._ensure_tsv_opened()
self._ensure_lineidx_loaded()
try:
pos = self._lineidx[self._sample_indices[idx]]
except:
logging.info('=> {}-{}'.format(self.tsv_file, idx))
raise
self._fp.seek(pos)
return [s.strip() for s in self._fp.readline().split('\t')]
def seek_first_column(self, idx: int):
self._ensure_tsv_opened()
self._ensure_lineidx_loaded()
pos = self._lineidx[idx]
self._fp.seek(pos)
return read_to_character(self._fp, '\t')
def get_key(self, idx: int):
return self.seek_first_column(idx)
def __getitem__(self, index: int):
return self.seek(index)
def __len__(self):
return self.num_rows()
def _ensure_lineidx_loaded(self):
if self._lineidx is None:
logging.debug('=> loading lineidx: {}'.format(self.lineidx))
with open(self.lineidx, 'r') as fp:
lines = fp.readlines()
lines = [line.strip() for line in lines]
self._lineidx = [int(line) for line in lines]
# read the line list if exists
linelist = None
if op.isfile(self.linelist):
with open(self.linelist, 'r') as fp:
linelist = sorted(
[
int(line.strip())
for line in fp.readlines()
]
)
if op.isfile(self.chunks):
self._sample_indices = []
self._class_boundaries = []
class_boundaries = json.load(open(self.chunks, 'r'))
for class_name, boundary in class_boundaries.items():
start = len(self._sample_indices)
if class_name in self._class_selector:
for idx in range(boundary[0], boundary[1] + 1):
# NOTE: potentially slow when linelist is long, try to speed it up
if linelist and idx not in linelist:
continue
self._sample_indices.append(idx)
end = len(self._sample_indices)
self._class_boundaries.append((start, end))
else:
if linelist:
self._sample_indices = linelist
else:
self._sample_indices = list(range(len(self._lineidx)))
def _ensure_tsv_opened(self):
if self._fp is None:
if self._blob_storage:
self._fp = self._blob_storage.open(self.tsv_file)
else:
self._fp = open(self.tsv_file, 'r')
self.pid = os.getpid()
if self.pid != os.getpid():
logging.debug('=> re-open {} because the process id changed'.format(self.tsv_file))
self._fp = open(self.tsv_file, 'r')
self.pid = os.getpid()
class TSVWriter(object):
def __init__(self, tsv_file):
self.tsv_file = tsv_file
self.lineidx_file = op.splitext(tsv_file)[0] + '.lineidx'
self.tsv_file_tmp = self.tsv_file + '.tmp'
self.lineidx_file_tmp = self.lineidx_file + '.tmp'
self.tsv_fp = open(self.tsv_file_tmp, 'w')
self.lineidx_fp = open(self.lineidx_file_tmp, 'w')
self.idx = 0
def write(self, values, sep='\t'):
v = '{0}\n'.format(sep.join(map(str, values)))
self.tsv_fp.write(v)
self.lineidx_fp.write(str(self.idx) + '\n')
self.idx = self.idx + len(v)
def close(self):
self.tsv_fp.close()
self.lineidx_fp.close()
os.rename(self.tsv_file_tmp, self.tsv_file)
os.rename(self.lineidx_file_tmp, self.lineidx_file)