"""
Note: this code is used in the bw2ar.py file.
"""
#!/usr/bin/python | |
# -*- coding=utf-8 -*- | |
#--- | |
# $Id: arabic.py,v 1.6 2003/04/22 17:18:22 elzubeir Exp $ | |
# | |
# ------------ | |
# Description: | |
# ------------ | |
# | |
# Arabic codes | |
# | |
# (C) Copyright 2003, Arabeyes, Mohammed Elzubeir | |
# (C) Copyright 2019, Faris Abdullah Alasmary | |
# ----------------- | |
# Revision Details: (Updated by Revision Control System) | |
# ----------------- | |
# $Date: 2003/04/22 17:18:22 $ | |
# $Author: elzubeir $ | |
# $Revision: 1.6 $ | |
# $Source: /home/arabeyes/cvs/projects/duali/pyduali/pyduali/arabic.py,v $ | |
# | |
# This program is written under the BSD License. | |
#---
""" Constants for Arabic """
import re
# Single-character constants, listed in ascending code-point order.
# Each name is bound to a one-character unicode string.
COMMA = u'\u060C'
SEMICOLON = u'\u061B'
QUESTION = u'\u061F'
HAMZA = u'\u0621'
ALEF_MADDA = u'\u0622'
ALEF_HAMZA_ABOVE = u'\u0623'
WAW_HAMZA = u'\u0624'
ALEF_HAMZA_BELOW = u'\u0625'
YEH_HAMZA = u'\u0626'
ALEF = u'\u0627'
BEH = u'\u0628'
TEH_MARBUTA = u'\u0629'
TEH = u'\u062a'
THEH = u'\u062b'
JEEM = u'\u062c'
HAH = u'\u062d'
KHAH = u'\u062e'
DAL = u'\u062f'
THAL = u'\u0630'
REH = u'\u0631'
ZAIN = u'\u0632'
SEEN = u'\u0633'
SHEEN = u'\u0634'
SAD = u'\u0635'
DAD = u'\u0636'
TAH = u'\u0637'
ZAH = u'\u0638'
AIN = u'\u0639'
GHAIN = u'\u063a'
TATWEEL = u'\u0640'
FEH = u'\u0641'
QAF = u'\u0642'
KAF = u'\u0643'
LAM = u'\u0644'
MEEM = u'\u0645'
NOON = u'\u0646'
HEH = u'\u0647'
WAW = u'\u0648'
ALEF_MAKSURA = u'\u0649'
YEH = u'\u064a'
MADDA_ABOVE = u'\u0653'
HAMZA_ABOVE = u'\u0654'
HAMZA_BELOW = u'\u0655'
ZERO = u'\u0660'
ONE = u'\u0661'
TWO = u'\u0662'
THREE = u'\u0663'
FOUR = u'\u0664'
FIVE = u'\u0665'
SIX = u'\u0666'
SEVEN = u'\u0667'
EIGHT = u'\u0668'
NINE = u'\u0669'
PERCENT = u'\u066a'
DECIMAL = u'\u066b'
THOUSANDS = u'\u066c'
STAR = u'\u066d'
MINI_ALEF = u'\u0670'
ALEF_WASLA = u'\u0671'
FULL_STOP = u'\u06d4'
BYTE_ORDER_MARK = u'\ufeff'

# Diacritics
FATHATAN = u'\u064b'
DAMMATAN = u'\u064c'
KASRATAN = u'\u064d'
FATHA = u'\u064e'
DAMMA = u'\u064f'
KASRA = u'\u0650'
SHADDA = u'\u0651'
SUKUN = u'\u0652'

# Ligatures: single code points, followed by the equivalent
# two-character LAM + ALEF-variant sequences (SIMPLE_*).
LAM_ALEF = u'\ufefb'
LAM_ALEF_HAMZA_ABOVE = u'\ufef7'
LAM_ALEF_HAMZA_BELOW = u'\ufef9'
LAM_ALEF_MADDA_ABOVE = u'\ufef5'
SIMPLE_LAM_ALEF = u'\u0644\u0627'
SIMPLE_LAM_ALEF_HAMZA_ABOVE = u'\u0644\u0623'
SIMPLE_LAM_ALEF_HAMZA_BELOW = u'\u0644\u0625'
SIMPLE_LAM_ALEF_MADDA_ABOVE = u'\u0644\u0622'

# Compiled character-class patterns. Each one matches exactly one
# character drawn from the constants joined inside the brackets.

# Any one of the eight diacritics defined above.
HARAKAT_PAT = re.compile(u"[" + u"".join([
    FATHATAN, DAMMATAN, KASRATAN,
    FATHA, DAMMA, KASRA, SUKUN,
    SHADDA,
]) + u"]")

# WAW or YEH carrying a hamza.
HAMZAT_PAT = re.compile(u"[" + u"".join([WAW_HAMZA, YEH_HAMZA]) + u"]")

# ALEF variants plus the standalone hamza-above / hamza-below marks.
ALEFAT_PAT = re.compile(u"[" + u"".join([
    ALEF_MADDA, ALEF_HAMZA_ABOVE,
    ALEF_HAMZA_BELOW, HAMZA_ABOVE,
    HAMZA_BELOW,
]) + u"]")

# The four single-code-point LAM-ALEF ligatures.
LAMALEFAT_PAT = re.compile(u"[" + u"".join([
    LAM_ALEF,
    LAM_ALEF_HAMZA_ABOVE,
    LAM_ALEF_HAMZA_BELOW,
    LAM_ALEF_MADDA_ABOVE,
]) + u"]")
def strip_tashkeel(text):
    """Return *text* with diacritics and alef wasla removed.

    Every character matched by ``HARAKAT_PAT`` (fathatan, dammatan,
    kasratan, fatha, damma, kasra, sukun, shadda) is deleted, and so is
    every ``ALEF_WASLA`` character. Note that alef wasla is dropped
    entirely, not replaced by a plain alef.

    :param text: unicode string to clean.
    :returns: the cleaned unicode string.
    """
    # HARAKAT_PAT already contains FATHA, so no separate fatha pass is needed.
    text = HARAKAT_PAT.sub(u'', text)
    return text.replace(ALEF_WASLA, u'')
def strip_tatweel(text):
    """Return *text* with every ``TATWEEL`` character removed.

    :param text: unicode string to clean.
    :returns: the string without any tatweel characters.
    """
    # A plain substring replace is enough for a single fixed character.
    return text.replace(TATWEEL, u'')
def remove_non_arabic(text):
    """Strip tashkeel and tatweel, then drop everything that is not Arabic.

    After the two strip passes, every character outside the ranges
    U+0621-U+063A and U+0641-U+064A (other than the plain space) is
    replaced by a space, and runs of whitespace are collapsed so the
    result has single spaces and no leading or trailing space.

    :param text: unicode string to clean.
    :returns: the kept characters as words separated by single spaces.
    """
    # Diacritics and tatweel fall outside the kept ranges; deleting them
    # first keeps them from turning into spaces that would split a word.
    text = strip_tashkeel(text)
    text = strip_tatweel(text)
    spaced = re.sub(u"[^\u0621-\u063A\u0641-\u064A ]", " ", text,
                    flags=re.UNICODE)
    # split() with no argument discards empty fields, collapsing the padding.
    return ' '.join(spaced.split())