import json
import csv
import re

from translate import Translator
def load_regex_pattern(filename):
    """Load the regex configuration (pattern name -> pattern definition) from a JSON file."""
    try:
        with open(filename, 'r', encoding="utf-8") as config_file:
            config_data = json.load(config_file)
            return config_data
    except FileNotFoundError:
        print("regex file not found")
        return {}
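# Illustrative shape of the regex config, inferred from how the patterns are accessed
# below (each entry exposes a "pattern" key). The pattern names used here match the keys
# referenced in extract_and_store_info; the regex strings themselves are hypothetical,
# not the contents of mainpipeline/models/regex_config.json:
# {
#     "company_title":       {"pattern": "..."},
#     "date_pattern":        {"pattern": "..."},
#     "annual_pattern":      {"pattern": "..."},
#     "half_annual_pattern": {"pattern": "..."}
# }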
def translate_date(date_text):
    """Convert Arabic month names and Arabic-Indic digits in a date string to English."""
    # Arabic token -> English equivalent (months and digits)
    translation_dict = {
        "يناير": "January",
        "فبراير": "February",
        "مارس": "March",
        "ابريل": "April",
        "مايو": "May",
        "يونيو": "June",
        "يوليو": "July",
        "أغسطس": "August",
        "سبتمبر": "September",
        "اكتوبر": "October",
        "نوفمبر": "November",
        "ديسمبر": "December",
        "٠": "0",
        "١": "1",
        "٢": "2",
        "٣": "3",
        "٤": "4",
        "٥": "5",
        "٦": "6",
        "٧": "7",
        "٨": "8",
        "٩": "9",
    }
    # Map Arabic digits and month names into English
    for ar_token, en_token in translation_dict.items():
        date_text = date_text.replace(ar_token, en_token)
    return date_text
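# Illustrative example (the input string is hypothetical):
#   translate_date("١٥ يناير ٢٠٢٣")  ->  "15 January 2023"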
def translate_text(text):
    """Translate Arabic text to English using the translate package."""
    translator = Translator(to_lang='en', from_lang='ar')
    translated_text = translator.translate(text)
    return translated_text
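# Note: the translate package calls an online translation provider, so translate_text
# needs network access and may be rate limited by that provider.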
def extract_and_store_info(input_file, output_csv, regex_patterns):
    """Run each configured regex over the entries' Arabic text and write the matches to a CSV."""
    extracted_data = {pattern_name: "" for pattern_name in regex_patterns}
    with open(input_file, encoding="utf-8") as json_file:
        json_data = json.load(json_file)
    # Keep the first match of each pattern found across the entries
    for pattern_name, pattern_data in regex_patterns.items():
        for entry in json_data:
            text = entry.get("Arabic text", "")
            match = re.search(pattern_data["pattern"], text, re.IGNORECASE)
            if match:
                extracted_data[pattern_name] = match.group()
                break
    # Translate the company name into English
    if extracted_data.get("company_title"):
        extracted_data["company_title"] = translate_text(extracted_data["company_title"])
    # Normalize the report-period wording to an English label
    if "annual_pattern" in regex_patterns and re.search(
            regex_patterns["annual_pattern"]["pattern"],
            extracted_data.get("annual_pattern", ""), re.IGNORECASE):
        extracted_data["annual_pattern"] = "Annual"
    elif "half_annual_pattern" in regex_patterns and re.search(
            regex_patterns["half_annual_pattern"]["pattern"],
            extracted_data.get("half_annual_pattern", ""), re.IGNORECASE):
        extracted_data["half_annual_pattern"] = "Half Annual"
    # Convert Arabic month names and digits in the extracted date to English
    if extracted_data.get("date_pattern"):
        extracted_data["date_pattern"] = translate_date(extracted_data["date_pattern"])
    # Write only the patterns that actually produced a match
    with open(output_csv, mode='w', encoding="utf-8", newline='') as csv_output_file:
        fieldnames = ["pattern_name", "extracted_data"]
        writer = csv.DictWriter(csv_output_file, fieldnames=fieldnames)
        writer.writeheader()
        for pattern_name, data in extracted_data.items():
            if data:
                writer.writerow({"pattern_name": pattern_name, "extracted_data": data})
    extracted_count = sum(1 for data in extracted_data.values() if data)
    print(f"{extracted_count} pieces of data extracted and stored in:", output_csv)
if __name__ == "__main__":
    input_file = "cache/output/basic_info_frame.json"
    output_csv_file = 'cache/output/outputregex.csv'
    regex_patterns = load_regex_pattern("mainpipeline/models/regex_config.json")
    if regex_patterns:
        extract_and_store_info(input_file, output_csv_file, regex_patterns)
    else:
        print("failed to load regex")