File size: 1,554 Bytes
9c642b1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
import os
import pyarrow.parquet as pq
from joblib import Parallel, delayed
def extract_text_from_parquet(parquet_file, output_dir):
    """Write each row of a Parquet file out as an individual UTF-8 text file.

    Each output file is named ``<doc_id>.txt`` and contains that row's
    ``text`` value.

    Parameters
    ----------
    parquet_file : str
        Path to the Parquet file to read. Must contain 'doc_id' and
        'text' columns.
    output_dir : str
        Existing directory the .txt files are written into.
    """
    # Read only the two columns we actually use, not the whole table.
    table = pq.read_table(parquet_file, columns=['doc_id', 'text'])
    df = table.to_pandas()
    # Iterating zip() over the columns is much faster than
    # DataFrame.iterrows(), which materializes a Series per row.
    for doc_id, text in zip(df['doc_id'], df['text']):
        # Build the output path from the document id.
        # NOTE(review): assumes doc_id contains no path separators — verify
        # against the upstream data.
        output_file = os.path.join(output_dir, f"{doc_id}.txt")
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(text)
        print(f"Extracted text for doc_id: {doc_id}")
def process_parquet_file(parquet_file, parquet_directory, output_directory):
    """Resolve *parquet_file* against its directory and extract its text.

    Thin per-file worker used by the parallel driver in main().
    """
    full_path = os.path.join(parquet_directory, parquet_file)
    extract_text_from_parquet(full_path, output_directory)
def main():
    """Extract text from every Parquet file in 'hindi' into 'txt/' in parallel."""
    source_dir = 'hindi'
    target_dir = 'txt/'
    # Make sure the destination exists before any worker tries to write.
    os.makedirs(target_dir, exist_ok=True)
    # Collect the Parquet files to process.
    candidates = [
        name for name in os.listdir(source_dir) if name.endswith('.parquet')
    ]
    # Fan the per-file work out across all available cores with joblib.
    jobs = (
        delayed(process_parquet_file)(name, source_dir, target_dir)
        for name in candidates
    )
    Parallel(n_jobs=-1)(jobs)

if __name__ == '__main__':
    main()
|