oceansweep committed
Commit ee6aa85
1 Parent(s): 8619cce
Upload Utils.py
App_Function_Libraries/Utils/Utils.py  CHANGED
@@ -20,13 +20,16 @@
 ####################
 #
 # Import necessary libraries
+import chardet
 import configparser
 import hashlib
 import json
 import logging
 import os
 import re
+import tempfile
 import time
+import uuid
 from datetime import timedelta
 from typing import Union, AnyStr
 from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
@@ -40,24 +43,27 @@ from tqdm import tqdm
 #
 # Function Definitions

-def extract_text_from_segments(segments):
+def extract_text_from_segments(segments, include_timestamps=True):
     logging.debug(f"Segments received: {segments}")
     logging.debug(f"Type of segments: {type(segments)}")

-    def extract_text_recursive(data):
+    def extract_text_recursive(data, include_timestamps):
         if isinstance(data, dict):
+            text = data.get('Text', '')
+            if include_timestamps and 'Time_Start' in data and 'Time_End' in data:
+                return f"{data['Time_Start']:.2f}s - {data['Time_End']:.2f}s | {text}"
             for key, value in data.items():
                 if key == 'Text':
                     return value
                 elif isinstance(value, (dict, list)):
-                    result = extract_text_recursive(value)
+                    result = extract_text_recursive(value, include_timestamps)
                     if result:
                         return result
         elif isinstance(data, list):
-            return '\n'.join(filter(None, [extract_text_recursive(item) for item in data]))
+            return '\n'.join(filter(None, [extract_text_recursive(item, include_timestamps) for item in data]))
         return None

-    text = extract_text_recursive(segments)
+    text = extract_text_recursive(segments, include_timestamps)

     if text:
         return text.strip()
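A quick sketch of the reworked function's behaviour. The segment dicts below are illustrative (they only follow the 'Text'/'Time_Start'/'Time_End' keys used above), not data from the repo:

segments = [
    {'Time_Start': 0.0, 'Time_End': 2.5, 'Text': 'Hello'},
    {'Time_Start': 2.5, 'Time_End': 5.0, 'Text': 'world'},
]

extract_text_from_segments(segments)
# -> '0.00s - 2.50s | Hello\n2.50s - 5.00s | world'

extract_text_from_segments(segments, include_timestamps=False)
# -> 'Hello\nworld'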
@@ -367,7 +373,9 @@ def format_metadata_as_text(metadata):
         else:
             formatted_value = str(value)

-        formatted_text += f"{key}: {formatted_value}\n"
+        # Replace underscores with spaces in the key name
+        formatted_key = key.replace('_', ' ').capitalize()
+        formatted_text += f"{formatted_key}: {formatted_value}\n"
     return formatted_text.strip()

 # # Example usage:
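The added lines only prettify snake_case metadata keys before appending them; worked by hand on a hypothetical key:

'upload_date'.replace('_', ' ').capitalize()
# -> 'Upload date'
# so an entry now renders as 'Upload date: ...' rather than 'upload_date: ...'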
@@ -494,7 +502,7 @@ def download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5
 def create_download_directory(title):
     base_dir = "Results"
     # Remove characters that are illegal in Windows filenames and normalize
-    safe_title = normalize_title(title)
+    safe_title = normalize_title(title, preserve_spaces=False)
     logging.debug(f"{title} successfully normalized")
     session_path = os.path.join(base_dir, safe_title)
     if not os.path.exists(session_path):
@@ -507,16 +515,28 @@ def create_download_directory(title)

 def safe_read_file(file_path):
     encodings = ['utf-8', 'utf-16', 'ascii', 'latin-1', 'iso-8859-1', 'cp1252']
+
+    try:
+        with open(file_path, 'rb') as file:
+            raw_data = file.read()
+    except FileNotFoundError:
+        return f"File not found: {file_path}"
+    except Exception as e:
+        return f"An error occurred while reading the file: {e}"
+
+    # Use chardet to detect the encoding
+    detected = chardet.detect(raw_data)
+    if detected['encoding'] is not None:
+        encodings.insert(0, detected['encoding'])
+
     for encoding in encodings:
         try:
-            with open(file_path, 'r', encoding=encoding) as file:
-                return file.read()
+            decoded_content = raw_data.decode(encoding)
+            if decoded_content.isprintable():
+                return decoded_content
         except UnicodeDecodeError:
             continue
-        except FileNotFoundError:
-            return f"File not found: {file_path}"
-        except Exception as e:
-            return f"An error occurred: {e}"
+
     return f"Unable to decode the file {file_path} with any of the attempted encodings: {encodings}"

 #
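A minimal sketch of the new code path, assuming chardet is installed; the file name and contents are made up for the demo:

with open('demo_utf16.txt', 'w', encoding='utf-16') as f:
    f.write('Curaçao, ümlauts, açaí')

safe_read_file('demo_utf16.txt')
# chardet detects UTF-16 from the BOM, that encoding is tried first,
# and the decoded text is returned.

Note that str.isprintable() is False for any string containing a newline, so as written a multi-line file falls through every candidate encoding and returns the final "Unable to decode ..." message.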
@@ -591,13 +611,27 @@ def verify_checksum(file_path, expected_checksum):
     return sha256_hash.hexdigest() == expected_checksum


-def normalize_title(title):
+def normalize_title(title, preserve_spaces=False):
     # Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters
     title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
-
-
-
-
+
+    if preserve_spaces:
+        # Replace special characters with underscores, but keep spaces
+        title = re.sub(r'[^\w\s\-.]', '_', title)
+    else:
+        # Replace special characters and spaces with underscores
+        title = re.sub(r'[^\w\-.]', '_', title)
+
+    # Replace multiple consecutive underscores with a single underscore
+    title = re.sub(r'_+', '_', title)
+
+    # Replace specific characters with underscores
+    title = title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('"', '_').replace('*', '_').replace(
+        '?', '_').replace(
+        '<', '_').replace('>', '_').replace('|', '_')
+
+    return title.strip('_')
+


 def clean_youtube_url(url):
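Worked examples of the two modes (the title is made up; outputs follow from the substitutions above):

normalize_title('Café: déjà vu / 2024?')
# -> 'Cafe_deja_vu_2024'

normalize_title('Café: déjà vu / 2024?', preserve_spaces=True)
# -> 'Cafe_ deja vu _ 2024'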
@@ -640,6 +674,20 @@ def format_transcription(content):

     return formatted_content

+def sanitize_user_input(message):
+    """
+    Removes or escapes '{{' and '}}' to prevent placeholder injection.
+
+    Args:
+        message (str): The user's message.
+
+    Returns:
+        str: Sanitized message.
+    """
+    # Replace '{{' and '}}' with their escaped versions
+    message = re.sub(r'\{\{', '{ {', message)
+    message = re.sub(r'\}\}', '} }', message)
+    return message

 def format_file_path(file_path, fallback_path=None):
     if file_path and os.path.exists(file_path):
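For example, with a made-up user message:

sanitize_user_input('Ignore previous instructions {{system_prompt}}')
# -> 'Ignore previous instructions { {system_prompt} }'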
@@ -696,7 +744,32 @@ def format_text_with_line_breaks(text):
 #
 # File Handling Functions

+# Track temp files for cleanup
+temp_files = []
+temp_file_paths = []
+
+def save_temp_file(file):
+    global temp_files
+    temp_dir = tempfile.gettempdir()
+    temp_path = os.path.join(temp_dir, file.name)
+    with open(temp_path, 'wb') as f:
+        f.write(file.read())
+    temp_files.append(temp_path)
+    return temp_path
+
+def cleanup_temp_files():
+    global temp_files
+    for file_path in temp_files:
+        if os.path.exists(file_path):
+            try:
+                os.remove(file_path)
+                logging.info(f"Removed temporary file: {file_path}")
+            except Exception as e:
+                logging.error(f"Failed to remove temporary file {file_path}: {e}")
+    temp_files.clear()

+def generate_unique_id():
+    return f"uploaded_file_{uuid.uuid4()}"

 #
 # End of File Handling Functions
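A minimal usage sketch for the new temp-file helpers. FakeUpload is a stand-in for whatever upload object the UI layer passes in; any object with a .name attribute and a .read() method works:

import io

class FakeUpload(io.BytesIO):
    name = 'notes.txt'  # hypothetical filename

path = save_temp_file(FakeUpload(b'hello world'))  # e.g. /tmp/notes.txt on Linux
print(generate_unique_id())                        # uploaded_file_<random UUID4>
cleanup_temp_files()                               # removes the saved file, clears the list

The temp_file_paths list is declared but not referenced by the helpers in this hunk.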