File size: 1,554 Bytes
9c642b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import os
import pyarrow.parquet as pq
from joblib import Parallel, delayed

def extract_text_from_parquet(parquet_file, output_dir):
    """Write each row of a Parquet file to its own UTF-8 text file.

    The file is expected to contain ``doc_id`` and ``text`` columns; each
    row is written to ``<output_dir>/<doc_id>.txt``.

    Args:
        parquet_file: Path to the input Parquet file.
        output_dir: Existing directory to write the ``.txt`` files into.
    """
    # Read only the two columns we actually use — avoids materializing
    # any other columns the Parquet file may carry.
    table = pq.read_table(parquet_file, columns=['doc_id', 'text'])

    # Convert the table to a Pandas DataFrame
    df = table.to_pandas()

    # zip over the columns directly: much faster than DataFrame.iterrows(),
    # which constructs a Series per row and may coerce dtypes.
    for doc_id, text in zip(df['doc_id'], df['text']):
        # Create the output file path
        output_file = os.path.join(output_dir, f"{doc_id}.txt")

        # Write the text to the output file
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(text)

        print(f"Extracted text for doc_id: {doc_id}")

def process_parquet_file(parquet_file, parquet_directory, output_directory):
    """Resolve *parquet_file* against *parquet_directory* and extract its text.

    Thin per-file wrapper so joblib workers receive plain string arguments.
    """
    extract_text_from_parquet(
        os.path.join(parquet_directory, parquet_file),
        output_directory,
    )

def main():
    """Extract text from every Parquet file in ``hindi/`` into ``txt/``."""
    source_dir = 'hindi'
    dest_dir = 'txt/'

    # Make sure the destination exists before any worker starts writing.
    os.makedirs(dest_dir, exist_ok=True)

    # Collect the Parquet files to process.
    parquet_names = [
        name for name in os.listdir(source_dir) if name.endswith('.parquet')
    ]

    # Fan the files out across all available CPU cores with joblib.
    jobs = (
        delayed(process_parquet_file)(name, source_dir, dest_dir)
        for name in parquet_names
    )
    Parallel(n_jobs=-1)(jobs)


if __name__ == '__main__':
    main()