YchKhan committed on
Commit
533a642
1 Parent(s): ee041e5

Update classification.py

Browse files
Files changed (1) hide show
  1. classification.py +5 -34
classification.py CHANGED
@@ -171,46 +171,17 @@ def process_categories(categories, model):
171
 
172
 
173
  def match_categories(df, category_df, treshold=0.45):
174
-
175
- categories_list, experts_list, topic_list, scores_list = [], [], [], []
176
- for ebd_content in df['Embeddings']:
177
  if isinstance(ebd_content, torch.Tensor):
178
  cos_scores = util.cos_sim(ebd_content, torch.stack(list(category_df['Embeddings']), dim=0))[0]
179
  high_score_indices = [i for i, score in enumerate(cos_scores) if score > treshold]
180
-
181
- # Append the corresponding categories, experts, and topics for each high-scoring index
182
- categories_list.append([category_df.loc[index, 'description'] for index in high_score_indices])
183
- experts_list.append([category_df.loc[index, 'experts'] for index in high_score_indices])
184
- topic_list.append([category_df.loc[index, 'topic'] for index in high_score_indices])
185
- scores_list.append([float(cos_scores[index]) for index in high_score_indices])
186
- else:
187
- categories_list.append(np.nan)
188
- experts_list.append(np.nan)
189
- topic_list.append(np.nan)
190
- scores_list.append('pas interessant')
191
-
192
- df["Description"] = categories_list
193
- df["Expert"] = experts_list
194
- df["Topic"] = topic_list
195
- df["Score"] = scores_list
196
  return df
197
 
198
def flatten_nested_lists(nested_list):
    """Flatten a list of potentially nested lists into a single list.

    Only ``list`` instances are descended into; every other item (including
    tuples and strings) is kept as a single element. Items appear in the
    result in the same left-to-right order as in the input.

    The traversal is iterative (an explicit stack of iterators) rather than
    recursive, so arbitrarily deep nesting cannot raise ``RecursionError``.

    Args:
        nested_list: Iterable whose items may themselves be lists.

    Returns:
        A new flat list containing every non-list item.
    """
    flattened_list = []
    pending = [iter(nested_list)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend; the partially consumed parent iterator stays on
                # the stack and resumes once this sub-list is exhausted.
                pending.append(iter(item))
                break
            flattened_list.append(item)
        else:
            # Innermost iterator is exhausted: go back to its parent.
            pending.pop()
    return flattened_list
207
-
208
  def save_data(df, filename):
209
- # Apply flattening and then join for the 'Expert' column
210
- df['Expert'] = df['Expert'].apply(lambda x: ', '.join(flatten_nested_lists(x)) if isinstance(x, list) else x)
211
- df['Description'] = df['Description'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
212
- df['Topic'] = df['Topic'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
213
- df['Score'] = df['Score'].apply(lambda x: ', '.join(map(str, x)) if isinstance(x, list) else x)
214
 
215
  df = df.drop(columns=['Embeddings'])
216
  new_filename = filename.replace(".", "_classified.")
 
171
 
172
 
173
  def match_categories(df, category_df, treshold=0.45):
174
+ for topic in category_df['topic']:
175
+ df[topic] = 0
176
+ for i, ebd_content in enumerate(df['Embeddings']):
177
  if isinstance(ebd_content, torch.Tensor):
178
  cos_scores = util.cos_sim(ebd_content, torch.stack(list(category_df['Embeddings']), dim=0))[0]
179
  high_score_indices = [i for i, score in enumerate(cos_scores) if score > treshold]
180
+ for j in high_score_indices:
181
+ df.loc[i, category_df.loc[j, 'topic']] = float(cos_scores[index])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  return df
183
 
 
 
 
 
 
 
 
 
 
 
184
  def save_data(df, filename):
 
 
 
 
 
185
 
186
  df = df.drop(columns=['Embeddings'])
187
  new_filename = filename.replace(".", "_classified.")