# import streamlit as st # import os # import json # import re # import datasets # import tiktoken # import zipfile # from pathlib import Path # # 定义 tiktoken 编码器 # encoding = tiktoken.get_encoding("cl100k_base") # _CITATION = """\ # @InProceedings{huggingface:dataset, # title = {MGT detection}, # author={Trustworthy AI Lab}, # year={2024} # } # """ # _DESCRIPTION = """\ # For detecting machine generated text. # """ # _HOMEPAGE = "" # _LICENSE = "" # # MGTHuman 类 # class MGTHuman(datasets.GeneratorBasedBuilder): # VERSION = datasets.Version("1.0.0") # BUILDER_CONFIGS = [ # datasets.BuilderConfig(name="human", version=VERSION, description="This part of human data"), # datasets.BuilderConfig(name="Moonshot", version=VERSION, description="Data from the Moonshot model"), # datasets.BuilderConfig(name="gpt35", version=VERSION, description="Data from the gpt-3.5-turbo model"), # datasets.BuilderConfig(name="Llama3", version=VERSION, description="Data from the Llama3 model"), # datasets.BuilderConfig(name="Mixtral", version=VERSION, description="Data from the Mixtral model"), # datasets.BuilderConfig(name="Qwen", version=VERSION, description="Data from the Qwen model"), # ] # DEFAULT_CONFIG_NAME = "human" # def _info(self): # features = datasets.Features( # { # "id": datasets.Value("int32"), # "text": datasets.Value("string"), # "file": datasets.Value("string"), # } # ) # return datasets.DatasetInfo( # description=_DESCRIPTION, # features=features, # homepage=_HOMEPAGE, # license=_LICENSE, # citation=_CITATION, # ) # def truncate_text(self, text, max_tokens=2048): # tokens = encoding.encode(text, allowed_special={'<|endoftext|>'}) # if len(tokens) > max_tokens: # tokens = tokens[:max_tokens] # truncated_text = encoding.decode(tokens) # last_period_idx = truncated_text.rfind('。') # if last_period_idx == -1: # last_period_idx = truncated_text.rfind('.') # if last_period_idx != -1: # truncated_text = truncated_text[:last_period_idx + 1] # return truncated_text # else: # 
return text # def get_text_by_index(self, filepath, index, cut_tokens=False, max_tokens=2048): # count = 0 # with open(filepath, 'r') as f: # data = json.load(f) # for row in data: # if not row["text"].strip(): # continue # if count == index: # text = row["text"] # if cut_tokens: # text = self.truncate_text(text, max_tokens) # return text # count += 1 # return "Index 超出范围,请输入有效的数字。" # def count_entries(self, filepath): # """返回文件中的总条数,用于动态生成索引范围""" # count = 0 # with open(filepath, 'r') as f: # data = json.load(f) # for row in data: # if row["text"].strip(): # count += 1 # return count # # Streamlit UI # st.title("MGTHuman Dataset Viewer") # # 上传包含 JSON 文件的 ZIP 文件 # uploaded_folder = st.file_uploader("上传包含 JSON 文件的 ZIP 文件夹", type=["zip"]) # if uploaded_folder: # folder_path = Path("temp") # folder_path.mkdir(exist_ok=True) # zip_path = folder_path / uploaded_folder.name # with open(zip_path, "wb") as f: # f.write(uploaded_folder.getbuffer()) # with zipfile.ZipFile(zip_path, 'r') as zip_ref: # zip_ref.extractall(folder_path) # # 递归获取所有 JSON 文件并分类到不同的 domain # category = {} # for json_file in folder_path.rglob("*.json"): # 使用 rglob 递归查找所有 JSON 文件 # domain = json_file.stem.split('_task3')[0] # category.setdefault(domain, []).append(str(json_file)) # # 显示可用的 domain 下拉框 # if category: # selected_domain = st.selectbox("选择数据种类", options=list(category.keys())) # # 确定该 domain 的第一个文件路径并获取条目数量 # file_to_display = category[selected_domain][0] # mgt_human = MGTHuman(name=selected_domain) # total_entries = mgt_human.count_entries(file_to_display) # st.write(f"可用的索引范围: 0 到 {total_entries - 1}") # # 输入序号查看文本 # index_to_view = st.number_input("输入要查看的文本序号", min_value=0, max_value=total_entries - 1, step=1) # # 添加复选框以选择是否切割文本 # cut_tokens = st.checkbox("是否对文本进行token切割", value=False) # if st.button("显示文本"): # text = mgt_human.get_text_by_index(file_to_display, index=index_to_view, cut_tokens=cut_tokens) # st.write("对应的文本内容为:", text) # else: # st.write("未找到任何 JSON 文件,请检查 ZIP 文件结构。") # # 
# 清理上传文件的临时目录 (remnant of the commented-out legacy viewer app above)
# if st.button("清除文件"):
#     import shutil
#     shutil.rmtree("temp")
#     st.write("临时文件已清除。")

import streamlit as st
from transformers import pipeline


@st.cache_resource  # cache so the model is downloaded/loaded once per server process
def load_model():
    """Build and return the Hugging Face text-classification pipeline.

    Returns:
        transformers.Pipeline: a ``text-classification`` pipeline backed by
        the pre-trained GPT-2 output detector. Replace the model name here
        if a different detector is preferred.
    """
    return pipeline("text-classification", model="roberta-base-openai-detector")


st.title("Machine-Generated Text Detector")
st.write("Enter a text snippet, and I will analyze it to determine if it is likely written by a human or generated by a machine.")

# Load the (cached) classifier once at the top of the script run.
classifier = load_model()

# Input text
input_text = st.text_area("Enter text here:", height=150)

# Button to trigger detection
if st.button("Analyze"):
    if input_text.strip():  # reject empty AND whitespace-only input
        # truncation=True keeps the input within the model's 512-token
        # context window; without it, long pastes raise a runtime error.
        result = classifier(input_text, truncation=True)

        # Pipeline returns a list of {'label': str, 'score': float} dicts.
        label = result[0]['label']
        score = result[0]['score'] * 100  # convert to percentage for readability

        # The published detector checkpoint emits "Fake" (machine-generated)
        # vs "Real" (human-written); some exports emit generic ids like
        # "LABEL_1" instead. Accept either scheme so the machine branch is
        # actually reachable. NOTE(review): confirm the id->label mapping
        # against the model card of the exact checkpoint deployed.
        if label.upper() in ("FAKE", "LABEL_1"):
            st.write("**Result:** This text is likely **Machine-Generated**.")
        else:
            st.write("**Result:** This text is likely **Human-Written**.")

        # Display confidence score
        st.write(f"**Confidence Score:** {score:.2f}%")
    else:
        st.write("Please enter some text for analysis.")