committed on
Browse files- tabs/ +135 -135
@@ -94,169 +94,169 @@ full_txt_fr = load_data(dataPath+'/small_vocab_fr')
94 |
if not st.session_state.reCalcule:
95 |
full_txt_en, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en = load_all_preprocessed_data('en')
96 |
full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr = load_all_preprocessed_data('fr')
97 |
98 |
99 |
100 |
stop_words = set(stopwords.words(lang))
101 |
# stop_words holds the set of stopwords for the requested language
102 |
filtered_sentence = []
103 |
for word in text.split():
104 |
if word not in stop_words:
105 |
106 |
return " ".join(filtered_sentence)
107 |
108 |
109 |
110 |
# Removing URLs
111 |
sentence = re.sub(r"https?://\S+|www\.\S+", "", sentence )
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
def clean_untranslated_sentence(data1, data2):
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
import spacy
146 |
147 |
nlp_en = spacy.load('en_core_web_sm')
148 |
nlp_fr = spacy.load('fr_core_news_sm')
149 |
150 |
151 |
def lemmatize(sentence,lang):
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
def preprocess_txt (data, lang):
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
if lemmatize_to_do:
214 |
n_batch = 12
215 |
batch_size = round((nb_phrases/ n_batch)+0.5)
216 |
for i in range(n_batch):
217 |
to_lem = ".".join([s for s in data[i*batch_size:(i+1)*batch_size]])
218 |
data_lemmatized = data_lemmatized+"."+lemmatize(to_lem,lang).lower()
219 |
220 |
data_lem_for_sw = data_lemmatized[1:]
221 |
data_lemmatized = data_lem_for_sw.split('.')
222 |
for i in range(nb_phrases):
223 |
224 |
225 |
226 |
227 |
# Elimination des StopWords en un lot
228 |
# On élimine les Stopwords des phrases lémmatisés, si cette phase a eu lieu
229 |
# (wosw signifie "WithOut Stop Words")
230 |
if stopwords_to_do:
231 |
if lemmatize_to_do:
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
def count_world(data):
94 |
# When the session's reCalcule flag is not set, five preprocessed values per
# language are unpacked from load_all_preprocessed_data ('en', then 'fr').
# NOTE(review): indentation is lost in the reviewed text; both assignments
# presumably belong to this `if` — confirm.
if not st.session_state.reCalcule:
95 |
full_txt_en, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en = load_all_preprocessed_data('en')
96 |
full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr = load_all_preprocessed_data('fr')
97 |
98 |
99 |
def remove_stopwords(text, lang):
    """Return *text* with the stopwords of language *lang* removed.

    The text is split on whitespace and every token is compared as-is
    (case-sensitively) against ``stopwords.words(lang)``. Tokens that are
    not stopwords are re-joined with single spaces.

    Args:
        text: String to filter.
        lang: Language name forwarded to ``stopwords.words``
            (``preprocess_txt`` passes ``'english'`` or ``'french'``).

    Returns:
        The filtered text as one space-separated string.
    """
    # A set gives constant-time membership tests for every token.
    stop_words = set(stopwords.words(lang))
    return " ".join(word for word in text.split() if word not in stop_words)
107 |
108 |
def clean_undesirable_from_text(sentence, lang):
    """Strip URLs, most punctuation and digits from *sentence*.

    The ``.`` character is deliberately kept: ``preprocess_txt`` later
    splits the cleaned text into sentences on it.

    Args:
        sentence: Text to clean.
        lang: Accepted for interface compatibility; the cleaning rules
            below do not depend on it.

    Returns:
        The cleaned text with runs of spaces collapsed to a single space.
    """
    # Removing URLs
    sentence = re.sub(r"https?://\S+|www\.\S+", "", sentence)

    # Removing punctuation (the "." character is kept).
    # Order matters: the contractions must be expanded before the generic
    # apostrophe rule turns every remaining "'" into a space.
    replacements = (
        ("..", "."),
        (",", ""),
        (";", ""),
        (":", ""),
        ("?", ""),
        ('"', ""),
        ("-", " "),
        ("it's", "it is"),
        ("isn't", "is not"),
        ("'", " "),
    )
    for old, new in replacements:
        sentence = sentence.replace(old, new)

    # Removing digits
    sentence = re.sub(r"[0-9]", "", sentence)

    # Removing additional spaces
    sentence = re.sub(" +", " ", sentence)

    return sentence
135 |
136 |
def clean_untranslated_sentence(data1, data2):
    """Drop every position at which *data1* and *data2* hold equal items.

    Both sequences are modified in place and also returned.

    NOTE(review): the index initialisation and the two removal statements
    were not present in the reviewed text; they are reconstructed from the
    shape of the loop (index advanced only when nothing is removed) —
    confirm against the original.

    Args:
        data1: Mutable sequence of sentences.
        data2: Mutable sequence aligned with *data1*, at least as long.

    Returns:
        The tuple ``(data1, data2)`` after removal.
    """
    i = 0
    while i < len(data1):
        if data1[i] == data2[i]:
            # Identical pair: remove it from both sides and stay on the
            # same index, which now holds the next pair.
            del data1[i]
            del data2[i]
        else:
            i += 1
    return data1, data2
144 |
145 |
import spacy
146 |
147 |
# spaCy pipeline loaded from the 'en_core_web_sm' (English) model.
nlp_en = spacy.load('en_core_web_sm')
148 |
# spaCy pipeline loaded from the 'fr_core_news_sm' (French) model.
nlp_fr = spacy.load('fr_core_news_sm')
149 |
150 |
151 |
def lemmatize(sentence, lang):
    """Return *sentence* with every token replaced by its lemma.

    NOTE(review): the statements under the two language branches were not
    present in the reviewed text; selecting the module-level ``nlp_en`` /
    ``nlp_fr`` pipelines is the reconstruction — confirm.

    Args:
        sentence: Text to lemmatize.
        lang: ``'en'`` or ``'fr'``.

    Returns:
        The lemmas of all tokens joined by single spaces, or ``None`` when
        *lang* is neither ``'en'`` nor ``'fr'``.
    """
    if lang == 'en':
        nlp = nlp_en
    elif lang == 'fr':
        nlp = nlp_fr
    else:
        # Unsupported language: same bare return as before, made explicit.
        return None

    # Create a Doc object and join the lemma of each of its tokens.
    doc = nlp(sentence)
    return " ".join(token.lemma_ for token in doc)
168 |
169 |
170 |
def preprocess_txt (data, lang):
# Preprocess a corpus of sentences for one language ('en' or 'fr').
#
# The sentences in `data` are joined into one string, cleaned with
# clean_undesirable_from_text, re-split on '.', tokenized with word_tokenize,
# lemmatized and stripped of stopwords when the corresponding flags are set,
# and finally counted into a bag-of-words DataFrame.
#
# Returns the tuple (data, corpus, data_split, data_lemmatized, data_wosw,
# txt_n_unique_val, sentence_length, data_length_wo_stopwords, data_lem_length).
#
# NOTE(review): several statement bodies are absent from the text under
# review (gutter numbers with no code after them); the comments below only
# describe the statements that are visible.
# NOTE(review): lemmatize_to_do, stopwords_to_do and the initial value of
# data_lemmatized are not assigned in the visible lines — presumably defined
# elsewhere; confirm.
171 |
172 |
word_count = collections.Counter()
173 |
word_lem_count = collections.Counter()
174 |
word_wosw_count = collections.Counter()
175 |
corpus = []
176 |
data_split = []
177 |
sentence_length = []
178 |
data_split_wo_stopwords = []
179 |
data_length_wo_stopwords = []
180 |
data_lem = []
181 |
data_lem_length = []
182 |
183 |
# Join all sentences into one '.'-separated string so cleaning runs once.
txt_en_one_string= ". ".join([s for s in data])
184 |
txt_en_one_string = txt_en_one_string.replace('..', '.')
185 |
# NOTE(review): the string literal 'lang' is passed here, not the `lang`
# parameter — looks unintended; confirm.
txt_en_one_string = " "+clean_undesirable_from_text(txt_en_one_string, 'lang')
186 |
# Back to a list of sentences, split on the '.' kept by the cleaning step.
data = txt_en_one_string.split('.')
187 |
if data[-1]=="":
188 |
189 |
for i in range(len(data)): # Strip the ' ' that starts or ends each sentence
190 |
if data[i][0] == ' ':
191 |
192 |
if data[i][-1] == ' ':
193 |
194 |
nb_phrases = len(data)
195 |
196 |
# Build a list of words (sentence_split)
197 |
for i,sentence in enumerate(data):
198 |
sentence_split = word_tokenize(sentence)
199 |
200 |
201 |
202 |
203 |
# Lemmatization and stopword removal are done in batches for speed
204 |
# (instead of sentence by sentence)
205 |
# Both steps need to know the language of the corpus
206 |
# `l` is the language name handed to remove_stopwords below.
if lang == 'en': l='english'
207 |
elif lang=='fr': l='french'
208 |
else: l="unknown"
209 |
210 |
if l!="unknown":
211 |
# Lemmatization in 12 batches (no more than 1 M characters can be lemmatized at once)
212 |
213 |
if lemmatize_to_do:
214 |
n_batch = 12
215 |
# round(x + 0.5) is never below ceil(x), so n_batch slices of this size
# cover every sentence.
batch_size = round((nb_phrases/ n_batch)+0.5)
216 |
for i in range(n_batch):
217 |
to_lem = ".".join([s for s in data[i*batch_size:(i+1)*batch_size]])
218 |
data_lemmatized = data_lemmatized+"."+lemmatize(to_lem,lang).lower()
219 |
220 |
# Drop the leading '.' added by the first concatenation above.
data_lem_for_sw = data_lemmatized[1:]
221 |
data_lemmatized = data_lem_for_sw.split('.')
222 |
for i in range(nb_phrases):
223 |
224 |
225 |
226 |
227 |
# Stopword removal in a single batch
228 |
# Stopwords are removed from the lemmatized sentences when that step was run
229 |
# (wosw stands for "WithOut Stop Words")
230 |
if stopwords_to_do:
231 |
if lemmatize_to_do:
232 |
data_wosw = remove_stopwords(data_lem_for_sw,l)
233 |
234 |
data_wosw = remove_stopwords(txt_en_one_string,l)
235 |
236 |
data_wosw = data_wosw.split('.')
237 |
for i in range(nb_phrases):
238 |
239 |
240 |
241 |
242 |
corpus = list(word_count.keys())
243 |
244 |
# Build a DataFrame txt_n_unique_val:
245 |
# columns = words
246 |
# rows = sentences
247 |
# cell value = number of occurrences of the word in the sentence
248 |
249 |
## BOW (bag of words)
250 |
from sklearn.feature_extraction.text import CountVectorizer
251 |
# token_pattern: a token is a maximal run of characters other than space
# and apostrophe.
count_vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 1), token_pattern=r"[^' ']+" )
252 |
253 |
# Count how many times each word appears in each sentence
254 |
countvectors = count_vectorizer.fit_transform(data)
255 |
# `corpus` is overwritten here with the vectorizer's feature names.
corpus = count_vectorizer.get_feature_names_out()
256 |
257 |
txt_n_unique_val= pd.DataFrame(columns=corpus,index=range(nb_phrases), data=countvectors.todense()).astype(float)
258 |
259 |
return data, corpus, data_split, data_lemmatized, data_wosw, txt_n_unique_val, sentence_length, data_length_wo_stopwords, data_lem_length
260 |
261 |
262 |
def count_world(data):