|
import os |
|
import pyarrow.parquet as pq |
|
from joblib import Parallel, delayed |
|
|
|
def extract_text_from_parquet(parquet_file, output_dir):
    """Write one UTF-8 ``.txt`` file per row of *parquet_file*.

    Each row must carry a ``doc_id`` column (used as the file stem) and a
    ``text`` column (the file contents). Files are written into
    *output_dir*, which must already exist.

    Args:
        parquet_file: Path to the input parquet file.
        output_dir: Directory where ``<doc_id>.txt`` files are created.
    """
    # Read only the two columns we need instead of the whole table,
    # and iterate plain Python lists instead of DataFrame.iterrows(),
    # which pays per-row pandas Series construction overhead.
    table = pq.read_table(parquet_file, columns=['doc_id', 'text'])
    doc_ids = table.column('doc_id').to_pylist()
    texts = table.column('text').to_pylist()

    for doc_id, text in zip(doc_ids, texts):
        output_file = os.path.join(output_dir, f"{doc_id}.txt")

        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(text)

        print(f"Extracted text for doc_id: {doc_id}")
|
|
|
def process_parquet_file(parquet_file, parquet_directory, output_directory):
    """Resolve *parquet_file* against *parquet_directory* and extract its text."""
    extract_text_from_parquet(
        os.path.join(parquet_directory, parquet_file),
        output_directory,
    )
|
|
|
def main(parquet_directory='hindi', output_directory='txt/'):
    """Extract text from every parquet file in *parquet_directory*.

    Discovers all ``*.parquet`` files in *parquet_directory* and processes
    them in parallel (one joblib worker per CPU core), writing one
    ``<doc_id>.txt`` file per row into *output_directory*.

    Args:
        parquet_directory: Directory containing the input parquet files.
            Defaults to ``'hindi'`` for backward compatibility.
        output_directory: Directory for the extracted text files; created
            if missing. Defaults to ``'txt/'``.
    """
    os.makedirs(output_directory, exist_ok=True)

    parquet_files = [
        file for file in os.listdir(parquet_directory)
        if file.endswith('.parquet')
    ]

    # n_jobs=-1 uses all available CPU cores.
    Parallel(n_jobs=-1)(
        delayed(process_parquet_file)(parquet_file, parquet_directory, output_directory)
        for parquet_file in parquet_files
    )
|
# Run the extraction only when executed as a script, not on import.
if __name__ == '__main__':

    main()
|
|