Spaces:

ThornRugal
/

ChinesePrivacyPolicyMark

Sleeping

App Files Files Community

ThornRugal committed on 15 days ago

Commit

e5451b9

•

1 Parent(s): 32cabe6

Add application files and updated Readme

Browse files

Files changed (4) hide show

.gitattributes +1 -0
README.md +16 -13
app.py +170 -0
data_Excel_format.xlsx +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data_Excel_format.xlsx filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,13 +1,16 @@
----
-title: ChinesePrivacyPolicyMark
-emoji: 👁
-colorFrom: gray
-colorTo: purple
-sdk: gradio
-sdk_version: 5.5.0
-app_file: app.py
-pinned: false
-short_description: Mark Chinese Privacy Policy with Retrieve models
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: ChinesePrivacyPolicyMark
+emoji: 👁
+colorFrom: gray
+colorTo: purple
+sdk: gradio
+sdk_version: 5.5.0
+app_file: app.py
+pinned: false
+short_description: Mark Chinese Privacy Policy with Retrieve models
+---
+使用的数据地址：https://github.com/EnlightenedAI/CAPP-130<br>
+使用预训练好的模型检索预先保存好的隐私政策，以此标注隐私政策中的关键信息。<br>
+首先使用特征提取模型将隐私政策中的句子编码为向量，将其与预先保存的向量对比进行一次“粗筛”，选取与其最为接近的n条记录。<br>
+之后使用文本相似度计算模型，将筛选出来的n条记录与原本的文本进行匹配，过滤出相似度高于阈值p的m条记录，将这m条记录所属的标记合并起来。<br>
+由于没有使用GPU，直接在Space中运行会很慢。有条件可以clone下来试试。

app.py ADDED Viewed

	@@ -0,0 +1,170 @@

+import gradio as gr
+import warnings
+warnings.filterwarnings("ignore")
+import pandas as pd
+import numpy as np
+import faiss
+import ast
+import torch.nn.functional as F
+import torch
+from transformers import AutoModel, AutoTokenizer
+# Hugging Face id of the sentence-embedding model whose encode() output feeds the FAISS index.
+Encoding_model = 'jinaai/jina-embeddings-v2-base-zh'
+# NOTE: trust_remote_code=True runs modelling code shipped inside the model repository.
+model = AutoModel.from_pretrained(Encoding_model, trust_remote_code=True, torch_dtype=torch.bfloat16)
+# No-op expression statement: the device move is commented out, so the model stays on its default device.
+model#.to("cuda")
+# `similarity_model` first holds the model id string and is rebound to the loaded model two lines below.
+similarity_model = 'Alibaba-NLP/gte-multilingual-base'
+similarity_tokenizer = AutoTokenizer.from_pretrained(similarity_model)
+similarity_model = AutoModel.from_pretrained(similarity_model, trust_remote_code=True)#.to("cuda")
+def get_not_empty_data(df,x_column="text",y_column="label"):
+    """Group labelled rows by text and merge their label lists.
+
+    Rows whose label cell is exactly the string "[]" are dropped. Every
+    remaining label cell is parsed with ast.literal_eval, the labels of rows
+    sharing the same text are pooled, and duplicates are removed (the order
+    of labels inside a list is not preserved).
+
+    Returns a DataFrame with column "x" (unique texts, first-seen order) and
+    column "y" (list of distinct labels for that text).
+    """
+    labelled = df[df[y_column] != "[]"].reset_index(drop=True)
+    pooled = {}
+    for text, raw_labels in zip(labelled[x_column], labelled[y_column]):
+        pooled.setdefault(text, []).extend(ast.literal_eval(raw_labels))
+    texts = list(pooled)
+    labels = [list(set(found)) for found in pooled.values()]
+    return pd.DataFrame({"x": texts, "y": labels})
+# Load the labelled sentences; get_not_empty_data reads the "text" and "label" columns by default.
+data_all = pd.read_excel("data_Excel_format.xlsx")
+df_dict_all = get_not_empty_data(data_all)
+# Parallel arrays: x_dict[i] is a reference sentence and y_dict[i] is its list of labels.
+x_dict = df_dict_all["x"].values
+y_dict = df_dict_all["y"].values
+def calc_scores(x):
+    """Return the dot products between the first row of x and every other row.
+
+    For a 2-D x the result keeps a leading axis of length 1, i.e. its shape is
+    (1, len(x) - 1).
+    """
+    query = x[:1]
+    candidates = x[1:]
+    return query @ candidates.T
+def get_idxs(threshold,max_len,arr):
+    """Return positions in arr whose value is >= threshold, capped at max_len.
+
+    When fewer than max_len positions qualify they are returned in ascending
+    position order. Otherwise the max_len highest-valued positions are
+    returned, best first.
+    """
+    res = np.where(arr >= threshold)[0]
+    if len(res)<max_len:
+        return res
+    # Honour max_len; the cap used to be hard-coded to 3.
+    res = res[np.argsort(-arr[res])][:max_len]
+    return res
+def merge_set_to_list(set_list):
+    """Return the union of all sets in set_list as a new set.
+
+    Despite the name, the result is a set, not a list. An empty set_list
+    yields an empty set.
+    """
+    merged = set()
+    for member in set_list:
+        merged |= member
+    return merged
+def get_predict_result(index,score,threshold,max_len):
+    """Merge the labels of the best-scoring retrieved reference sentences.
+
+    index:     row ids into x_dict / y_dict, one per candidate.
+    score:     similarity of each candidate to the query, aligned with index.
+    threshold: minimum score a candidate needs to be kept.
+    max_len:   maximum number of candidates whose labels are merged.
+
+    Returns (labels, texts): labels is the set union of the kept candidates'
+    labels and texts holds their reference sentences. When no candidate
+    passes the threshold the result is (set(), []).
+    """
+    score = np.asarray(score).flatten()
+    index = np.asarray(index).flatten()
+    # FAISS pads missing neighbours with id -1; never let those index into x_dict / y_dict.
+    keep = np.where((score >= threshold) & (index >= 0))[0]
+    if len(keep)==0:
+        # Return an empty set (not a dict) so both branches have the same type.
+        return set(),[]
+    if len(keep)>max_len:
+        # Rank by similarity score (not by row id) and honour max_len.
+        keep = keep[np.argsort(-score[keep])][:max_len]
+    res_index = index[keep]
+    res = merge_set_to_list([set(i) for i in y_dict[res_index]])
+    return res,x_dict[res_index]
+# Embed every reference sentence and build the inner-product FAISS index used for the coarse search.
+bsize = 256
+# The seed chunk keeps the (0, 768) shape when there are no reference sentences.
+chunks = [np.empty(shape=[0,768],dtype="float32")]
+with torch.no_grad():
+    for i in range(0,len(x_dict),bsize):
+        # The loop used to read an undefined name `x`; the reference sentences live in x_dict.
+        tmp = model.encode(x_dict[i:i+bsize].tolist())
+        chunks.append(np.asarray(tmp,dtype="float32"))
+# Concatenate once instead of re-copying the growing array on every batch; FAISS needs contiguous float32.
+vec = np.ascontiguousarray(np.concatenate(chunks),dtype="float32")
+index = faiss.IndexFlatIP(768)
+# On L2-normalised vectors the inner product equals cosine similarity.
+faiss.normalize_L2(vec)
+index.add(vec)
+# Persist the index, then serve the copy that was read back from disk.
+faiss.write_index(index,"all_index.faiss")
+index = faiss.read_index("all_index.faiss")
+def predict_label(x,threshold=0.85,n_nearest=10,max_result_len=3):
+    """Predict the label set of every sentence in x.
+
+    Each sentence is embedded with `model`, its n_nearest neighbours are
+    fetched from the FAISS `index` (coarse search), and the sentence plus its
+    neighbours are re-embedded with `similarity_model`. Neighbours whose
+    similarity to the sentence reaches threshold are kept (at most
+    max_result_len of them) and their labels are merged.
+
+    Returns a list with one entry per batch. The batch size is fixed to 1, so
+    there is one single-element list per sentence; its element is the merged
+    label set, empty when nothing passes the threshold.
+    """
+    # UI sliders may deliver floats; FAISS and reshape need real integers.
+    n_nearest = int(n_nearest)
+    max_result_len = int(max_result_len)
+    bsize=1
+    dimension=768
+    y_pred = []
+    with torch.no_grad():
+        for i in range(0,len(x),bsize):
+            sentences = x[i:i+bsize]
+            vec = model.encode(sentences)
+            faiss.normalize_L2(vec)
+            _, indexes = index.search(vec,n_nearest)
+            # One group per query sentence: the sentence itself followed by its retrieved neighbours.
+            x_pred = [[sentence]+x_dict[row].tolist() for sentence,row in zip(sentences,indexes)]
+            flat = [candidate for group in x_pred for candidate in group]
+            batch_dict = similarity_tokenizer(flat, max_length=768, padding=True, truncation=True, return_tensors='pt')#.to("cuda")
+            outputs = similarity_model(**batch_dict)
+            # Take the first-token vector and truncate the embedding axis.
+            # The previous `[:, 0][:dimension]` sliced the batch axis instead.
+            embeddings = outputs.last_hidden_state[:, 0, :dimension]
+            embeddings = F.normalize(embeddings, p=2, dim=1)
+            embeddings = embeddings.reshape(len(x_pred),n_nearest+1,-1).detach().cpu().numpy()
+            scores = [calc_scores(group) for group in embeddings]
+            pred = [get_predict_result(indexes[k],scores[k],threshold=threshold,max_len=max_result_len) for k in range(len(scores))]
+            y_pred.append([labels for labels,_ in pred])
+    return y_pred
+# HTML prefix for every rendered result: an inline stylesheet for #custom_id followed by the colour legend
+# (red = potential risk, blue = permission acquisition, purple = data collection,
+# green = data / permission management, brown = sharing, entrusting, transfer, public disclosure).
+# generate_HTML starts its output with this string and the UI uses it as the initial output content.
+CSS_Content = """
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <style>
+        #custom_id {
+            border: 2px solid red;
+            padding: 10px;
+            background-color: lightgray;
+        }
+    </style>
+</head>
+</html>
+<span style="color: red;line-height:1;">红色字体：潜在风险</span><br>
+<span style="color: blue;line-height:1;">蓝色字体：权限获取</span><br>
+<span style="color: purple;line-height:1;">紫色字体：数据收集</span><br>
+<span style="color: green;line-height:1;">绿色字体：数据、权限管理</span><br>
+<span style="color: brown;line-height:1;">棕色字体：共享、委托、转让、公开（披露）</span><br>
+"""
+# Maps a label name to the CSS colour of sentences carrying it. generate_HTML walks the keys in
+# insertion order and uses the first label that matches, so earlier entries take priority.
+color_dict = {"潜在风险":"red",
+              "权限获取":"blue",
+              "数据收集":"purple",
+              "数据、权限管理":"green",
+              "共享、委托、转让、公开（披露）":"brown"
+             }
+def generate_HTML(text,threshold=0.85,n_nearest=10,max_result_len=3):
+    """Render text as HTML with each sentence coloured by its predicted label.
+
+    text is split into lines on "\n" and each line into sentences on "。".
+    Every non-blank sentence is classified with predict_label and wrapped in
+    a <span> coloured by the first color_dict key found in its label set.
+    Blank fragments and sentences without a matching label get an unstyled
+    span. The result starts with CSS_Content, and each input line ends with
+    "<br>".
+    """
+    import html  # local import: escapes user text before it is embedded in markup
+    parts = [CSS_Content]
+    for line in text.split("\n"):
+        paragraph = line.split("。")
+        # Do not spend two model passes on empty fragments (e.g. after a trailing "。").
+        to_score = [sentence for sentence in paragraph if sentence.strip()]
+        predictions = iter(predict_label(to_score,threshold,n_nearest,max_result_len) if to_score else [])
+        spans = []
+        for sentence in paragraph:
+            style = ""
+            if sentence.strip():
+                # predict_label yields one single-element list per sentence.
+                labels = next(predictions)[0]
+                # color_dict is ordered by decreasing importance, so only the first matching label sets the colour.
+                for label,color in color_dict.items():
+                    if label in labels:
+                        style = f' style="color: {color};line-height:1;"'
+                        break
+            # Escape the user-supplied sentence so it cannot inject markup into the page.
+            spans.append(f"<span{style}>{html.escape(sentence)}</span>")
+        parts.append("。".join(spans))
+        parts.append("<br>")
+    return "".join(parts)
+# Gradio UI: a text input, three tuning sliders, a submit button and an HTML output pane.
+with gr.Blocks() as demo:
+    with gr.Row():
+        input_text = gr.Textbox(lines=25,label="输入")
+    with gr.Row():
+        # Slider labels: similarity threshold / coarse-search candidate count / fine-filter result count.
+        threshold = gr.Slider(minimum=0.5,maximum=0.85,value=0.75,step=0.05,interactive=True,label="相似度阈值")
+        n_nearest = gr.Slider(minimum=3,maximum=10,value=10,step=1,interactive=True,label="粗筛语句数量")
+        max_result_len = gr.Slider(minimum=1,maximum=5,value=3,step=1,interactive=True,label="精筛语句数量")
+    with gr.Row():
+        submit_button = gr.Button("检测")
+    with gr.Row():
+        # Pass elem_id through the constructor so the #custom_id rule in CSS_Content targets this component;
+        # it used to be assigned as an attribute after construction.
+        output_text = gr.HTML(CSS_Content,elem_id="custom_id")
+    submit_button.click(fn=generate_HTML, inputs=[input_text,threshold,n_nearest,max_result_len], outputs=output_text)
+demo.launch()

data_Excel_format.xlsx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:db4b6d314555c48bc00053a4e581960e1991625d7962f3b88e00dd04c3233a6b
+size 2846032