srijaydeshpande committed on
Commit
3fe49a8
1 Parent(s): e0d79c9

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +214 -0
  2. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pdfminer.high_level import extract_pages
2
+ from pdfminer.layout import LTTextContainer
3
+ from tqdm import tqdm
4
+ import re
5
+ import gradio as gr
6
+ import os
7
+ import accelerate
8
+ import spaces
9
+ import subprocess
10
+ from huggingface_hub import hf_hub_download
11
+ from llama_cpp import Llama
12
+ from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
13
+ from llama_cpp_agent.providers import LlamaCppPythonProvider
14
+ from llama_cpp_agent.chat_history import BasicChatHistory
15
+ from llama_cpp_agent.chat_history.messages import Roles
16
+ # from vllm import LLM, SamplingParams
17
+
18
+ # subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
19
+ # subprocess.run('pip install llama-cpp-agent==0.2.10', shell=True)
20
+
21
+
22
# Download the quantized Llama-3 8B Instruct GGUF weights into ./models at
# startup so the Llama() constructor below can load them from a local path.
hf_hub_download(
    repo_id="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF",
    filename="Meta-Llama-3-8B-Instruct.Q8_0.gguf",
    local_dir = "./models"
)

# Alternative 70B model download, kept for reference but disabled
# (presumably too large for the available GPU budget — TODO confirm).
# hf_hub_download(
#     repo_id="bartowski/Meta-Llama-3-70B-Instruct-GGUF",
#     filename="Meta-Llama-3-70B-Instruct-Q3_K_M.gguf",
#     local_dir = "./models"
# )
33
+
34
def process_document(pdf_path):
    """Parse a PDF and return a dict mapping each page id to its extracted text.

    Pages are walked with pdfminer's extract_pages; tqdm shows progress
    since large documents can take a while.
    """
    page2content = {}
    for page in tqdm(extract_pages(pdf_path)):
        page2content[page.pageid] = process_page(page)
    return page2content
42
+
43
+
44
def process_page(extracted_page):
    """Return the concatenated, normalized text of one pdfminer page.

    Only LTTextContainer elements contribute text; other layout objects
    (images, lines, ...) are skipped.
    """
    # Sort layout elements by their top y-coordinate (descending) so text is
    # emitted in top-to-bottom reading order — pdfminer does not guarantee
    # element ordering. sorted() avoids the original's needless list copy
    # followed by an in-place sort.
    elements = sorted(extracted_page._objs, key=lambda el: el.y1, reverse=True)
    content = []
    for element in elements:
        if isinstance(element, LTTextContainer):
            content.append(extract_text_and_normalize(element))
    # Collapse runs of blank lines into a single newline.
    return re.sub('\n+', '\n', ''.join(content))
54
+
55
+
56
def extract_text_and_normalize(element):
    """Extract text from a pdfminer text element and normalize whitespace.

    Each physical line is stripped and its internal whitespace collapsed to
    single spaces. A line ending in a word character, digit, comma or hyphen
    is joined to the next line with a space (sentence continues); otherwise a
    newline is kept. Blank lines become a bare newline.
    """
    # FIX: use raw strings for the regex patterns — '\s' and '\,' in plain
    # strings are invalid escape sequences (SyntaxWarning on Python 3.12+).
    # Also accumulate parts in a list and join once instead of quadratic +=.
    parts = []
    for raw_line in element.get_text().split('\n'):
        line = raw_line.strip()
        if not line:
            parts.append('\n')
            continue
        line = re.sub(r'\s+', ' ', line)
        if re.search(r'[\w\d,\-]', line[-1]):
            parts.append(line + ' ')
        else:
            parts.append(line + '\n')
    return ''.join(parts)
72
+
73
+
74
def txt_to_html(text):
    """Wrap each line of *text* in a <p> tag inside a minimal HTML document.

    Output is identical to the original implementation; the quadratic
    string += accumulation is replaced by a single join.
    NOTE(review): no HTML escaping is applied — fine for trusted model
    output, but html.escape() would be needed for untrusted input.
    """
    paragraphs = ''.join('<p>{}</p>'.format(line.strip()) for line in text.split('\n'))
    return '<html><body>{}</body></html>'.format(paragraphs)
80
+
81
def _strip_prompt_echo(output, pdftext):
    # The model sometimes echoes a preamble before the document text; drop
    # everything before the first three words of the original page text.
    anchor = ' '.join(pdftext.split()[:3])
    idx = output.find(anchor)
    if idx != -1:
        output = output[idx:].strip()
    return output


def _llm_replace(llm, prompt, text, maxtokens, temperature):
    # One de-identification pass: the instruction goes in as the assistant
    # message and the working text as the user message (matches the
    # original call shape).
    response = llm.create_chat_completion(
        messages=[
            {"role": "assistant", "content": prompt},
            {"role": "user", "content": text},
        ],
        max_tokens=maxtokens,
        temperature=temperature,
    )
    return response['choices'][0]['message']['content']


def deidentify_doc(llm, pdftext, maxtokens, temperature, top_probability):
    """De-identify one page of text via four sequential LLM passes.

    Passes (in order): calendar dates -> [date], addresses -> [address],
    person names -> [name], NHS/case-note/hospital numbers -> [ID]. After
    each pass the model's echoed preamble, if any, is stripped so the next
    pass sees only document text.

    NOTE(review): top_probability is accepted but never forwarded to the
    model — kept for interface compatibility; confirm whether it should be
    passed as top_p.
    """
    # REFACTOR: the original repeated the same call + echo-strip stanza four
    # times; the prompts below are byte-identical to the originals.
    prompts = [
        "In the following text replace only the calendar dates with term [date]. Example: if input is 'Date of birth: 15/5/1959 calculated BP (Systolic 158.00 mm, Diastolic 124.95 mm)' output should be 'Date of birth: [date] calculated BP (Systolic 158.00 mm, Diastolic 124.95 mm)'",
        "In the following text replace location or address, such as '3970 Longview Drive, CV36HE' with term [address]. Replace complete GP address, such as 'Phanton Medical Centre, Birmingham, CV36HE' with term [address]. It is important that all addresses are completely replaced with [address].",
        "In the following text replace any person name with term [name]. It is important that all person names are replaced with term [name]. Remove any gender terms 'male' or 'female' if exists.",
        "In the following text replace the nhs number and the case note number with term [ID]. Replace Hospital number with [ID].",
    ]
    output = pdftext
    for prompt in prompts:
        output = _llm_replace(llm, prompt, output, maxtokens, temperature)
        output = _strip_prompt_echo(output, pdftext)
    return output
173
+
174
@spaces.GPU(duration=80)
def pdf_to_text(files, maxtokens=2048, temperature=0, top_probability=0.95):
    """De-identify an uploaded PDF and return the anonymized text.

    Loads the local GGUF Llama model, extracts each page with pdfminer,
    runs deidentify_doc() per page, and joins pages with blank lines.
    Returns an error message string for missing or non-PDF input.
    """
    files = [files]
    llm = Llama(
        model_path="models/Meta-Llama-3-8B-Instruct.Q8_0.gguf",
        flash_attn=True,
        n_gpu_layers=81,
        n_batch=1024,
        n_ctx=8192,
    )
    for file in files:
        if not file:
            return 'Please provide a valid PDF'
        # BUG FIX: the original tested split('.')[1] == 'pdf', which breaks
        # on names with extra dots (e.g. 'report.v2.pdf') and on '.PDF';
        # use splitext and compare case-insensitively.
        ext = os.path.splitext(os.path.basename(file))[1].lower()
        if ext != '.pdf':
            # FIX: previously fell through and returned None silently.
            return 'Please provide a valid PDF'
        page2content = process_document(file)
        anonymized_text = ''
        for page_id in page2content:
            pdftext = page2content[page_id]
            anonymized_text += deidentify_doc(llm, pdftext, maxtokens, temperature, top_probability)
            anonymized_text += '\n\n\n'
        # Matches the original control flow: only the first file is processed.
        return anonymized_text
197
+
198
# --- Gradio UI wiring (module level; iface.launch() runs on import) ---

# NOTE(review): `css` and every widget built below (sliders, number box,
# file pickers, textboxes) are created but never passed to gr.Interface —
# the interface uses only the generic 'file' input and "text" output, so
# pdf_to_text always runs with its default maxtokens/temperature. Confirm
# whether these widgets were meant to be wired in or can be removed.
css = ".gradio-container {background: 'logo.png'}"
temp_slider = gr.Slider(minimum=0, maximum=2, value=0.9, label="Temperature Value")
prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
max_tokens = gr.Number(value=600, label="Max Tokens")
input_folder = gr.File(file_count='multiple')
input_folder_text = gr.Textbox(label='Enter output folder path')
output_text = gr.Textbox()
output_path_component = gr.File(label="Select Output Path")
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=['file'],
    outputs="text",
    title='Histofy EndoDeID (Endoscopy Report De-Identification)',
    description="This application assists to remove personal information from the uploaded clinical report",
    theme=gr.themes.Soft(),
)
iface.launch()
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ accelerate
3
+ pdfminer.six
4
+ tqdm
5
+ huggingface_hub==0.22.2
6
+ scikit-build-core
7
+ https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.76-cu124/llama_cpp_python-0.2.76-cp310-cp310-linux_x86_64.whl
8
+ llama-cpp-agent>=0.2.10
9
+ vllm