Commit 6404d3b
Parent(s): 676ed72
Added preprocessing code

- .gitignore +4 -0
- README.md +6 -0
- preprocess_wiki.py +167 -0
- requirements.txt +1 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
+*.bz2
+*.gz
+output/
+.idea/
README.md
CHANGED
@@ -11,3 +11,9 @@ license: apache-2.0
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+# Installation
+pip install -r requirements.txt
+# Pre-processing
+wget https://dumps.wikimedia.org/arwiki/latest/arwiki-latest-pages-articles-multistream.xml.bz2
+wikiextractor -o output --json arwiki-latest-pages-articles-multistream.xml.bz2
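Once the dump has been extracted, the preprocess_wiki.py script added in this commit (below) chunks the extracted articles and writes them to a single JSON file. A minimal sketch of the equivalent programmatic call, assuming the wikiextractor output sits in output/ as in the script's __main__ block:

```python
from pathlib import Path

from preprocess_wiki import folder_to_json

# Mirrors the script's __main__ block: read wikiextractor's JSON-lines
# output from output/ and write the chunked articles to arwiki.json.
chunks = folder_to_json(Path("output"), Path("arwiki.json"))
print(f"Wrote {len(chunks)} chunks")
```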
preprocess_wiki.py
ADDED
@@ -0,0 +1,167 @@
+import os
+import json
+from pathlib import Path
+from tqdm.auto import tqdm
+from typing import List, Any, Dict
+
+MAX_WORDS = 250
+
+
+def folder_to_json(folder_in: Path, json_path: Path) -> List[Any]:
+    """
+    Process JSON lines from files in a given folder and write processed data to a new JSON file.
+
+    Parameters:
+        folder_in (Path): Path to the input folder containing the JSON files to process.
+        json_path (Path): Path to the output JSON file where the processed data will be written.
+
+    Returns:
+        List[Any]: List containing processed JSON data from all files in the input folder.
+
+    Example:
+        folder_to_json(Path("/path/to/input/folder"), Path("/path/to/output.json"))
+    """
+
+    folder_in = Path(folder_in)
+    json_out = []  # Initialize list to hold processed JSON data from all files
+
+    # Calculate total number of files in the input folder to set up the progress bar
+    total_files = sum([len(files) for r, d, files in os.walk(folder_in)])
+
+    # Initialize progress bar with total file count, description, and unit of progress
+    with tqdm(total=total_files, desc='Processing', unit='file') as pbar:
+        # Iterate through all files in the input folder
+        for subdir, _, files in os.walk(folder_in):
+            # Set progress bar postfix to display current directory
+            pbar.set_postfix_str(f"Directory: {subdir}", refresh=False)
+
+            for file in files:
+                # Update progress bar postfix to display current file and directory
+                pbar.set_postfix_str(f"Dir: {subdir} | File: {file}", refresh=True)
+
+                # Create full file path for the current file
+                file_path = Path(subdir) / file
+
+                # Open and read the current file
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    for line in f:
+                        # Load JSON data from each line and process it
+                        article = json.loads(line)
+                        # Ensure the preprocess function is defined and accessible
+                        processed_article = preprocess(article)
+                        # Add processed data to the output list
+                        json_out.extend(processed_article)
+
+                # Update progress bar after processing each file
+                pbar.update(1)
+
+    # Notify that the writing process is starting
+    pbar.write("Writing file!")
+    # Open the output file and write the processed data as JSON
+    with open(json_path, "w", encoding='utf-8') as outfile:
+        json.dump(json_out, outfile)
+    # Notify that the writing process is complete
+    pbar.write("File written!")
+
+    # Return the list of processed data
+    return json_out
+
+
+def preprocess(article: Dict[str, Any]) -> List[Dict[str, Any]]:
+    """
+    Preprocess a given article dictionary, extracting and processing the 'text' field. Because of the `break` introduced
+    we are only taking the first chunk
+
+    Parameters:
+        article (Dict[str, Any]): Input dictionary containing an article. Expected to have a 'text' field.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries, where each dictionary represents a preprocessed chunk of
+                              the original article's text. Each dictionary also contains the original article's
+                              fields (excluding 'text'), with an additional 'chunk_number' field indicating the
+                              order of the chunk.
+
+    Example:
+        article = {"text": "Example text", "title": "Example Title", "author": "John Doe"}
+        processed = preprocess(article)
+        print(processed)
+    """
+
+    # Create a new dictionary excluding the 'text' field from the original article
+    article_out = {k: v for k, v in article.items() if k != 'text'}
+
+    # Create a prefix using the article's text. Adjust this line as needed based on the actual structure of 'article'
+    prefix = f'عنوان: {article["text"]}. '
+    out = []  # Initialize the list to hold the preprocessed chunks
+
+    # Iterate over chunks obtained by splitting the article's text using the group_arabic_paragraphs function
+    # Ensure group_arabic_paragraphs is defined and accessible
+    for i, chunk in enumerate(group_arabic_paragraphs(article['text'], MAX_WORDS)):
+        # Concatenate the prefix with the current chunk
+        chunk = prefix + chunk
+        # Create a new dictionary with the chunk, original article fields (excluding 'text'), and chunk number
+        # Then append this dictionary to the 'out' list
+        out.append({'chunk': chunk, **article_out, 'chunk_number': i})
+        # Only take the first chunk
+        break
+
+    # Return the list of preprocessed chunks
+    return out
+
+
+def group_arabic_paragraphs(arabic_text: str, max_words: int) -> List[str]:
+    """
+    Group contiguous paragraphs of Arabic text without exceeding the max_words limit per group.
+
+    Parameters:
+        arabic_text (str): The input Arabic text where paragraphs are separated by newlines.
+        max_words (int): The maximum number of words allowed per group of paragraphs.
+
+    Returns:
+        List[str]: A list of strings where each string is a group of contiguous paragraphs.
+
+    Example:
+        arabic_text = "Paragraph1.\nParagraph2.\nParagraph3."
+        max_words = 5
+        result = group_arabic_paragraphs(arabic_text, max_words)
+        print(result)  # Output will depend on word count of each paragraph and max_words.
+    """
+
+    # Splitting the input text into paragraphs using newline as a delimiter
+    paragraphs = arabic_text.split('\n')
+
+    # Initialize variables to hold the grouped paragraphs and word count
+    grouped_paragraphs = []
+    current_group = []
+    current_word_count = 0
+
+    # Iterate through each paragraph in the input text
+    for paragraph in paragraphs:
+        # Count the number of words in the paragraph
+        word_count = len(paragraph.split())
+
+        # If adding the paragraph won't exceed the word limit, add it to the current group
+        if current_word_count + word_count <= max_words:
+            current_group.append(paragraph)
+            current_word_count += word_count  # Update the word count for the current group
+        else:
+            # If the paragraph exceeds the word limit, start a new group
+            if current_group:
+                grouped_paragraphs.append('\n'.join(current_group))
+            # Initialize a new group with the current paragraph
+            current_group = [paragraph]
+            current_word_count = word_count  # Reset the word count for the new group
+
+    # Add the last group if not empty
+    if current_group:
+        grouped_paragraphs.append('\n'.join(current_group))
+
+    # Return the grouped paragraphs as a list of strings
+    return grouped_paragraphs
+
+
+if __name__ == '__main__':
+    folder = Path('output')
+    file_out = Path('arwiki.json')
+    folder_to_json(folder, file_out)
+    print('Done!')
requirements.txt
ADDED
@@ -0,0 +1 @@
+wikiextractor==3.0.6
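For illustration, a minimal sketch of how the new preprocess function behaves on a single wikiextractor-style record; the field names (id, url, title) and the sample Arabic text are assumptions for the example, not part of the commit:

```python
from preprocess_wiki import preprocess

# Hypothetical wikiextractor-style record; field names are assumed for illustration.
article = {
    "id": "1",
    "url": "https://ar.wikipedia.org/?curid=1",
    "title": "مثال",
    "text": "فقرة أولى قصيرة.\nفقرة ثانية قصيرة.",
}

chunks = preprocess(article)
# Because of the `break` in preprocess, only the first chunk is returned.
# Each chunk keeps the original fields (minus 'text') plus 'chunk_number', e.g. roughly:
# [{'chunk': 'عنوان: ...', 'id': '1', 'url': '...', 'title': 'مثال', 'chunk_number': 0}]
print(len(chunks), chunks[0]["chunk_number"])  # 1 0
```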