Zamanonymize3

Sleeping

jfrery-zama committed on Mar 22

Commit

1dfccc3

•

1 Parent(s): 7552fa2

update representation with roberta + new fast model

Files changed (6) hide show

deployment/client.zip CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:147cf22982f4eb5198ad222bc290c0b3c36d5e15969294eb6a0a6f203c692c78
-size 129874

 version https://git-lfs.github.com/spec/v1
+oid sha256:81c8de4328853bc4b3df668ea1a174b6ed4d9d086c1a2cf544e3db639ac43b92
+size 30438

deployment/server.zip CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d24632c72f335b8bff44a62fde127eaf94e6eb21c9b2343c74fdcb2abdbf5747
-size 5637

 version https://git-lfs.github.com/spec/v1
+oid sha256:52100230351a158351fdede68d1bca31e1473e1f9f8abe6a71a0a95c7191b18e
+size 6333

deployment/versions.json CHANGED Viewed

	@@ -1 +1 @@
1	- {"concrete-python": "2.5.1", "concrete-ml": "1.5.0-rc0", "python": "3.8.16"}


1	+ {"concrete-python": "2.5.1", "concrete-ml": "1.5.0-rc0", "python": "3.10.12"}

fhe_anonymizer.py CHANGED Viewed

@@ -5,6 +5,8 @@ from pathlib import Path
 from concrete.ml.common.serialization.loaders import load
 import uuid
 import json
 base_dir = Path(__file__).parent
@@ -12,9 +14,10 @@ base_dir = Path(__file__).parent
 class FHEAnonymizer:
     def __init__(self, punctuation_list=".,!?:;"):
-        self.embeddings_model = gensim.models.FastText.load(
-            str(base_dir / "models/without_pronoun_embedded_model.model")
-        )
         self.punctuation_list = punctuation_list
         with open(base_dir / "models/without_pronoun_cml_xgboost.model", "r") as model_file:
             self.fhe_ner_detection = load(file=model_file)
@@ -49,7 +52,8 @@ class FHEAnonymizer:
                 continue
             # Prediction for each word
-            x = self.embeddings_model.wv[token][None]
             # prediction_proba = self.fhe_ner_detection.predict_proba(x)
             prediction_proba = self.fhe_inference(x)
             probability = prediction_proba[0][1]

 from concrete.ml.common.serialization.loaders import load
 import uuid
 import json
+from transformers import AutoTokenizer, AutoModel
+from utils_demo import get_batch_text_representation
 base_dir = Path(__file__).parent
 class FHEAnonymizer:
     def __init__(self, punctuation_list=".,!?:;"):
+        # Load tokenizer and model, move model to the selected device
+        self.tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
+        self.embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
         self.punctuation_list = punctuation_list
         with open(base_dir / "models/without_pronoun_cml_xgboost.model", "r") as model_file:
             self.fhe_ner_detection = load(file=model_file)
                 continue
             # Prediction for each word
+            x = get_batch_text_representation([token], self.embeddings_model, self.tokenizer)
             # prediction_proba = self.fhe_ner_detection.predict_proba(x)
             prediction_proba = self.fhe_inference(x)
             probability = prediction_proba[0][1]

models/cml_xgboost.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:431175c3c2bd7591ebfffa3ea45b1096dda5ba7588291252994f9be31db35534
-size 6625266

 version https://git-lfs.github.com/spec/v1
+oid sha256:ea897cbdde4c99e439b788615a664083cfd57c0b3259407c0830a70e5fb45b05
+size 82842

utils_demo.py CHANGED Viewed

@@ -1,24 +1,22 @@
-import uuid
-def process_tokens(tokens, inverse_uuid_map=None, uuid_map=None, embeddings_model=None, fhe_ner_detection=None, client=None):
-    """Processes tokens based on the provided parameters for either deanonymizing, anonymizing or default processing."""
-    processed_tokens = []
-    for token in tokens:
-        if not token.strip() or not re.match(r"\w+", token):  # Directly append non-word tokens or whitespace
-            processed_tokens.append(token)
-            continue
-        if inverse_uuid_map is not None:  # For deanonymizing response
-            processed_tokens.append(inverse_uuid_map.get(token, token))
-        elif uuid_map is not None and embeddings_model is not None and fhe_ner_detection is not None and client is not None:  # For FHEAnonymizer call
-            x = embeddings_model.wv[token][None]
-            prediction_proba = fhe_ner_detection.predict_proba(x)
-            probability = prediction_proba[0][1]
-            if probability >= 0.5:
-                tmp_uuid = uuid_map.get(token, str(uuid.uuid4())[:8])
-                processed_tokens.append(tmp_uuid)
-                uuid_map[token] = tmp_uuid
-            else:
-                processed_tokens.append(token)
-        else:
-            processed_tokens.append(token)
-    return ''.join(processed_tokens)

+import torch
+import numpy as np
+import random
+def get_batch_text_representation(texts, model, tokenizer, batch_size=1):
+    """
+    Get mean-pooled representations of given texts in batches.
+
+    Each batch is tokenized (with padding and truncation enabled), run
+    through ``model`` with gradient tracking disabled, and the last hidden
+    states are averaged using the attention mask as weights.
+
+    Args:
+        texts: Sliceable sequence of inputs; consecutive slices of
+            ``batch_size`` items are passed to ``tokenizer``.
+        model: Callable that accepts the tokenizer output as keyword
+            arguments and returns an object exposing ``last_hidden_state``.
+        tokenizer: Callable whose result is unpacked into ``model`` and
+            indexed with ``'attention_mask'``.
+        batch_size: Number of texts processed per forward pass.
+
+    Returns:
+        A NumPy array built from the per-text mean-pooled vectors.
+        NOTE(review): presumably shape (len(texts), hidden_size) -- confirm.
+    """
+    mean_pooled_batch = []
+    for i in range(0, len(texts), batch_size):
+        batch_texts = texts[i:i+batch_size]
+        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
+        # Inference only: no gradients are recorded for the forward pass.
+        with torch.no_grad():
+            outputs = model(**inputs, output_hidden_states=False)
+        last_hidden_states = outputs.last_hidden_state
+        # Broadcast the mask to the hidden-state shape so that positions with
+        # a zero mask value contribute nothing to the sum below.
+        input_mask_expanded = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_states.size()).float()
+        # Reduce over dim 1 (assumed to be the token/sequence axis -- confirm).
+        sum_embeddings = torch.sum(last_hidden_states * input_mask_expanded, 1)
+        sum_mask = input_mask_expanded.sum(1)
+        # NOTE(review): no guard against an all-zero mask row; such a row
+        # would divide by zero here -- confirm the tokenizer never yields one.
+        mean_pooled = sum_embeddings / sum_mask
+        # Append one NumPy vector per text in this batch.
+        mean_pooled_batch.extend(mean_pooled.cpu().detach().numpy())
+    return np.array(mean_pooled_batch)