CurioChen committed on
Commit
ad72ad9
1 Parent(s): edc1804

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +247 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import json
3
+ import requests
4
+ import datetime
5
+ import hashlib
6
+ import hmac
7
+ import logging
8
+ import ntplib
9
+ import time
10
+ import os
11
+ import tempfile
12
+ import io
13
+ from openai import OpenAI
14
+ from openpyxl import Workbook
15
+ import gradio as gr
16
+ import re
17
+ import fitz # PyMuPDF
18
+ import pandas as pd
19
+ from gradio_pdf import PDF # Import the new PDF component
20
+
21
# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Get configuration from environment variables.
# Credentials intentionally have NO real fallback value: secrets must never be
# committed to source control. Any key that was ever hard-coded in this file
# has to be treated as leaked and rotated at the provider.
SECRET_ID = os.getenv("SECRET_ID", "")
SECRET_KEY = os.getenv("SECRET_KEY", "")
REGION = os.getenv("REGION", "ap-guangzhou")
ENDPOINT = os.getenv("ENDPOINT", "lke.tencentcloudapi.com")
SERVICE = "lke"
ACTION = "ReconstructDocument"
VERSION = "2023-11-30"

# OpenAI API key (None when the environment variable is not set)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Surface a misconfigured deployment at start-up instead of failing later with
# an opaque authentication error from the remote service.
_missing = [name for name in ("SECRET_ID", "SECRET_KEY", "OPENAI_API_KEY") if not os.getenv(name)]
if _missing:
    logging.warning("Missing environment variables: %s", ", ".join(_missing))
36
+
37
+
38
+ # Get NTP time
39
def get_ntp_time():
    """Return the current time as a timezone-aware UTC datetime.

    The time is requested from ``pool.ntp.org`` (NTP version 3, 5 second
    timeout). If anything goes wrong while querying or converting the reply,
    a warning is logged and the local clock (in UTC) is used instead.
    """
    utc = datetime.timezone.utc
    client = ntplib.NTPClient()
    try:
        reply = client.request('pool.ntp.org', version=3, timeout=5)
        return datetime.datetime.fromtimestamp(reply.tx_time, utc)
    except Exception as e:
        # Best-effort: fall back to the local clock rather than failing.
        logging.warning(f"Unable to get NTP time, using local time: {e}")
        return datetime.datetime.now(utc)
47
+
48
+
49
+ # Signing function
50
def sign(key, msg):
    """Return the raw HMAC-SHA256 digest of ``msg`` keyed with ``key``.

    ``key`` is a bytes-like secret; ``msg`` is a text string that is encoded
    as UTF-8 before being authenticated. The result is the binary digest.
    """
    mac = hmac.new(key, digestmod=hashlib.sha256)
    mac.update(msg.encode("utf-8"))
    return mac.digest()
52
+
53
+
54
+ # Get authentication information
55
def get_auth(secret_id, secret_key, host, method, params, headers):
    """Build the signed request headers for a TC3-HMAC-SHA256 API call.

    Args:
        secret_id: Credential identifier placed in the ``Authorization`` header.
        secret_key: Secret used to derive the signing key.
        host: Request host; signed and echoed back as the ``Host`` header.
        method: HTTP method; upper-cased before signing.
        params: JSON-serialisable request body. The signature covers
            ``json.dumps(params)``.
        headers: Mapping of request headers. Only the content type is read;
            the lookup is case-insensitive. ``None`` is treated as empty.

    Returns:
        A dict of headers (Authorization, Host, Content-Type and the X-TC-*
        fields) to send with the request.

    NOTE(review): because the payload hash is computed over
    ``json.dumps(params)``, the caller presumably has to send exactly that
    serialisation as the request body — confirm at the call site.
    """
    algorithm = "TC3-HMAC-SHA256"
    ntp_time = get_ntp_time()
    timestamp = int(ntp_time.timestamp())
    date = ntp_time.strftime('%Y-%m-%d')

    # HTTP header names are case-insensitive. Prefer the exact lower-case key
    # (the original behaviour) and otherwise accept any casing, so a caller
    # passing "Content-Type" no longer silently gets the default value.
    headers = headers or {}
    ct = headers.get("content-type")
    if ct is None:
        ct = next(
            (value for name, value in headers.items() if name.lower() == "content-type"),
            "application/x-www-form-urlencoded",
        )

    http_request_method = method.upper()
    canonical_uri = "/"
    canonical_querystring = ""
    payload = json.dumps(params)
    # The canonical form uses lower-cased, trimmed header values; the headers
    # actually returned below keep the caller's original spelling.
    canonical_headers = f"content-type:{ct.strip().lower()}\nhost:{host.strip().lower()}\n"
    signed_headers = "content-type;host"
    hashed_request_payload = hashlib.sha256(payload.encode("utf-8")).hexdigest()
    canonical_request = (f"{http_request_method}\n{canonical_uri}\n{canonical_querystring}\n"
                         f"{canonical_headers}\n{signed_headers}\n{hashed_request_payload}")

    credential_scope = f"{date}/{SERVICE}/tc3_request"
    hashed_canonical_request = hashlib.sha256(canonical_request.encode("utf-8")).hexdigest()
    string_to_sign = f"{algorithm}\n{timestamp}\n{credential_scope}\n{hashed_canonical_request}"

    # Derive the signing key: date -> service -> fixed "tc3_request" terminator.
    secret_date = sign(f"TC3{secret_key}".encode("utf-8"), date)
    secret_service = sign(secret_date, SERVICE)
    secret_signing = sign(secret_service, "tc3_request")
    signature = hmac.new(secret_signing, string_to_sign.encode("utf-8"), hashlib.sha256).hexdigest()

    authorization = (f"{algorithm} Credential={secret_id}/{credential_scope}, "
                     f"SignedHeaders={signed_headers}, Signature={signature}")

    return {
        "Authorization": authorization,
        "Host": host,
        "Content-Type": ct,
        "X-TC-Timestamp": str(timestamp),
        "X-TC-Version": VERSION,
        "X-TC-Action": ACTION,
        "X-TC-Region": REGION
    }
93
+
94
+
95
+ # Extract information
96
def extract_information(content, model="gpt-4o", max_attempts=3, retry_delay=5):
    """Ask the OpenAI chat API to extract contract fields from ``content``.

    Args:
        content: Text of the document to analyse; appended to the prompt.
        model: Chat model name to use.
        max_attempts: How many times the request is tried before giving up.
        retry_delay: Seconds to wait between failed attempts.

    Returns:
        The extracted data as a pretty-printed JSON string, or ``None`` when
        every attempt failed (API error, unparsable JSON, or a finish reason
        other than ``"stop"``).
    """
    client = OpenAI(api_key=OPENAI_API_KEY)

    prompt = (
        "There are some guides, respond in detailed content, respond without content in (), JSON begin with contracts value:\n"
        "1. Contract awarded date\n"
        "2. Construction location (This part of the content is in the title, not in the table; the address must be returned and should be detailed.)\n"
        "3. Tender reference\n"
        "4. Construction summary (in the 'particular' section)\n"
        "5. Contractor\n"
        "6. Contractor address(this is not company name, the address must be returned and should be detailed.)\n"
        "7. Amount\n"
        "8. Notice publish date (at the end of the content)"
    )

    for attempt in range(max_attempts):
        try:
            logging.info(f"Extracting information (Attempt {attempt + 1}/{max_attempts})")
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant designed to output JSON"},
                    {"role": "user", "content": f"{prompt}\n\n{content}"}
                ],
                response_format={"type": "json_object"}
            )

            choice = response.choices[0]
            if choice.finish_reason == "stop":
                # Round-trip through json to validate the reply and normalise
                # its formatting before handing it to the caller.
                extracted_info = json.loads(choice.message.content)
                return json.dumps(extracted_info, ensure_ascii=False, indent=4)
            logging.warning(f"Warning: Unexpected completion reason - {choice.finish_reason}")
        except Exception as e:
            # Best-effort retry: any failure of this attempt is logged and the
            # loop moves on to the next one.
            logging.error(f"Error: API call failed - {str(e)}")

        if attempt < max_attempts - 1:  # If not the last attempt, wait before retrying
            time.sleep(retry_delay)

    return None  # Every attempt failed.
135
+
136
+
137
+ # JSON to Excel
138
def json_to_excel(json_data):
    """Write the contracts in ``json_data`` to a temporary ``.xlsx`` file.

    Args:
        json_data: JSON text whose top-level object has a ``contracts`` entry,
            either a list of contract objects or a single contract object.

    Returns:
        Path of the created Excel file. The file is not deleted automatically.

    Raises:
        KeyError: If the JSON has no ``contracts`` entry.

    Contract keys are matched to the column headers case-insensitively and
    ignoring every non-alphanumeric character, so ``"Contract awarded date"``
    fills the ``contract_awarded_date`` column. Missing fields become ``''``.
    """
    data = json.loads(json_data)

    wb = Workbook()
    ws = wb.active

    headers = ['contract_awarded_date', 'construction_location', 'tender_reference',
               'construction_summary', 'contractor', 'contractor_address',
               'amount', 'notice_publish_date']
    ws.append(headers)

    def normalize(name):
        # Reduce a key to lower-case alphanumerics for exact matching.
        return ''.join(c.lower() for c in name if c.isalnum())

    def to_cell(value):
        # Nested structures cannot be stored in a worksheet cell; keep their
        # content by serialising them to JSON text.
        if isinstance(value, (dict, list)):
            return json.dumps(value, ensure_ascii=False)
        return value

    # Normalise the headers once instead of once per contract key.
    normalized_headers = [normalize(header) for header in headers]

    contracts = data['contracts']
    if isinstance(contracts, dict):
        # A single contract returned as an object rather than a one-item list.
        contracts = [contracts]

    for contract in contracts:
        by_key = {}
        for key, value in contract.items():
            # setdefault keeps the first matching key, like the original lookup.
            by_key.setdefault(normalize(key), value)
        ws.append([to_cell(by_key.get(name, '')) for name in normalized_headers])

    # Reserve a unique path, then close the handle BEFORE saving: writing to a
    # still-open NamedTemporaryFile by name fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
        excel_path = tmp.name
    wb.save(excel_path)
    return excel_path
166
+
167
def clean_url(input_text):
    """Trim surrounding whitespace, then surrounding double quotes, from a URL.

    Only the double-quote character (``"``) is removed; whitespace that sits
    inside the quotes is left untouched.
    """
    without_whitespace = input_text.strip()
    return without_whitespace.strip('"')
171
+
172
+ # New function: Process uploaded PDF
173
def process_pdf(file):
    """Return the concatenated text of every page of a PDF.

    ``file`` may be an object exposing a ``name`` attribute (its value is
    opened) or anything else, which is passed to ``fitz.open`` directly.
    Any failure is logged and then re-raised to the caller.
    """
    logging.info(f"Start processing PDF file: {type(file)}")
    try:
        # File-like objects carry their path in ``name``; otherwise ``file``
        # itself is handed to PyMuPDF.
        pdf_source = file.name if hasattr(file, 'name') else file
        with fitz.open(pdf_source) as doc:
            text_content = "".join(page.get_text() for page in doc)
        logging.info("PDF processing successful")
        return text_content
    except Exception as e:
        logging.error(f"PDF processing error: {str(e)}")
        raise
193
+
194
def preview_excel(excel_path):
    """Return a Gradio Dataframe previewing the Excel file at ``excel_path``.

    At most the first 10 rows and first 8 columns are shown. If the file
    cannot be read, the error is logged and an empty Dataframe is returned.
    """
    try:
        sheet = pd.read_excel(excel_path, nrows=10)
        return gr.Dataframe(value=sheet.iloc[:10, :8])
    except Exception as e:
        logging.error(f"Excel preview error: {str(e)}")
        return gr.Dataframe()
202
+
203
def process_pdf_file(file):
    """Gradio handler: turn an uploaded PDF into an Excel file plus a preview.

    Returns a ``(status message, excel path or None, Dataframe)`` triple.
    On any failure the path is ``None`` and the Dataframe is empty; the
    status message describes what went wrong.
    """
    def failure(message):
        # Common shape of every unsuccessful result.
        return message, None, gr.Dataframe()

    if file is None:
        logging.warning("No file uploaded")
        return failure("Please upload a PDF file.")

    # Step 1: pull the raw text out of the PDF.
    try:
        file_label = file.name if hasattr(file, 'name') else 'No name'
        logging.info(f"Received file: {type(file)}, {file_label}")
        pdf_content = process_pdf(file)
    except Exception as e:
        logging.error(f"Error processing PDF file: {str(e)}", exc_info=True)
        return failure(f"Error processing PDF file: {str(e)}")

    # Step 2: extract the fields, write the workbook and build the preview.
    try:
        json_data = extract_information(pdf_content)
        if json_data is None:
            logging.error("Failed to extract information")
            return failure("Error extracting information. Please try again later.")

        excel_path = json_to_excel(json_data)
        excel_preview = preview_excel(excel_path)

        logging.info("File processing successful")
        return "Processing successful!", excel_path, excel_preview
    except Exception as e:
        logging.error(f"Error processing file: {str(e)}", exc_info=True)
        return failure(f"Error processing file: {str(e)}")
229
+
230
+ # Gradio interface
231
def _build_interface():
    """Assemble the Gradio interface: one PDF input, three result outputs."""
    pdf_input = PDF(label="Upload PDF File")  # Only keep the label parameter
    result_components = [
        gr.Textbox(label="Processing Status"),
        gr.File(label="Download Excel File"),
        gr.Dataframe(label="Excel Preview (First 10 rows, 8 columns)"),
    ]
    return gr.Interface(
        fn=process_pdf_file,
        inputs=[pdf_input],
        outputs=result_components,
        title="PDF Document Processing and Information Extraction",
        description="Upload a PDF file, and the system will process it and generate an Excel result.",
    )


iface = _build_interface()

# Run the Gradio app
if __name__ == "__main__":
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ openai
2
+ openpyxl
3
+ gradio
4
+ gradio_pdf
5
+ PyMuPDF
6
+ pandas
7
+ ntplib
8
+ requests