oceansweep committed on
Commit
ee6aa85
1 Parent(s): 8619cce

Upload Utils.py

Browse files
Files changed (1) hide show
  1. App_Function_Libraries/Utils/Utils.py +91 -18
App_Function_Libraries/Utils/Utils.py CHANGED
@@ -20,13 +20,16 @@
20
  ####################
21
  #
22
  # Import necessary libraries
 
23
  import configparser
24
  import hashlib
25
  import json
26
  import logging
27
  import os
28
  import re
 
29
  import time
 
30
  from datetime import timedelta
31
  from typing import Union, AnyStr
32
  from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
@@ -40,24 +43,27 @@ from tqdm import tqdm
40
  #
41
  # Function Definitions
42
 
43
- def extract_text_from_segments(segments):
44
  logging.debug(f"Segments received: {segments}")
45
  logging.debug(f"Type of segments: {type(segments)}")
46
 
47
- def extract_text_recursive(data):
48
  if isinstance(data, dict):
 
 
 
49
  for key, value in data.items():
50
  if key == 'Text':
51
  return value
52
  elif isinstance(value, (dict, list)):
53
- result = extract_text_recursive(value)
54
  if result:
55
  return result
56
  elif isinstance(data, list):
57
- return ' '.join(filter(None, [extract_text_recursive(item) for item in data]))
58
  return None
59
 
60
- text = extract_text_recursive(segments)
61
 
62
  if text:
63
  return text.strip()
@@ -367,7 +373,9 @@ def format_metadata_as_text(metadata):
367
  else:
368
  formatted_value = str(value)
369
 
370
- formatted_text += f"{key.capitalize()}: {formatted_value}\n"
 
 
371
  return formatted_text.strip()
372
 
373
  # # Example usage:
@@ -494,7 +502,7 @@ def download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5
494
  def create_download_directory(title):
495
  base_dir = "Results"
496
  # Remove characters that are illegal in Windows filenames and normalize
497
- safe_title = normalize_title(title)
498
  logging.debug(f"{title} successfully normalized")
499
  session_path = os.path.join(base_dir, safe_title)
500
  if not os.path.exists(session_path):
@@ -507,16 +515,28 @@ def create_download_directory(title):
507
 
508
  def safe_read_file(file_path):
509
  encodings = ['utf-8', 'utf-16', 'ascii', 'latin-1', 'iso-8859-1', 'cp1252']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
510
  for encoding in encodings:
511
  try:
512
- with open(file_path, 'r', encoding=encoding) as file:
513
- return file.read()
 
514
  except UnicodeDecodeError:
515
  continue
516
- except FileNotFoundError:
517
- return f"File not found: {file_path}"
518
- except Exception as e:
519
- return f"An error occurred: {e}"
520
  return f"Unable to decode the file {file_path} with any of the attempted encodings: {encodings}"
521
 
522
  #
@@ -591,13 +611,27 @@ def verify_checksum(file_path, expected_checksum):
591
  return sha256_hash.hexdigest() == expected_checksum
592
 
593
 
594
- def normalize_title(title):
595
  # Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters
596
  title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
597
- title = title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('"', '').replace('*', '').replace('?',
598
- '').replace(
599
- '<', '').replace('>', '').replace('|', '')
600
- return title
 
 
 
 
 
 
 
 
 
 
 
 
 
 
601
 
602
 
603
  def clean_youtube_url(url):
@@ -640,6 +674,20 @@ def format_transcription(content):
640
 
641
  return formatted_content
642
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
643
 
644
  def format_file_path(file_path, fallback_path=None):
645
  if file_path and os.path.exists(file_path):
@@ -696,7 +744,32 @@ def format_text_with_line_breaks(text):
696
  #
697
  # File Handling Functions
698
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
699
 
 
 
700
 
701
  #
702
  # End of File Handling Functions
 
20
  ####################
21
  #
22
  # Import necessary libraries
23
+ import chardet
24
  import configparser
25
  import hashlib
26
  import json
27
  import logging
28
  import os
29
  import re
30
+ import tempfile
31
  import time
32
+ import uuid
33
  from datetime import timedelta
34
  from typing import Union, AnyStr
35
  from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
 
43
  #
44
  # Function Definitions
45
 
46
+ def extract_text_from_segments(segments, include_timestamps=True):
47
  logging.debug(f"Segments received: {segments}")
48
  logging.debug(f"Type of segments: {type(segments)}")
49
 
50
+ def extract_text_recursive(data, include_timestamps):
51
  if isinstance(data, dict):
52
+ text = data.get('Text', '')
53
+ if include_timestamps and 'Time_Start' in data and 'Time_End' in data:
54
+ return f"{data['Time_Start']:.2f}s - {data['Time_End']:.2f}s | {text}"
55
  for key, value in data.items():
56
  if key == 'Text':
57
  return value
58
  elif isinstance(value, (dict, list)):
59
+ result = extract_text_recursive(value, include_timestamps)
60
  if result:
61
  return result
62
  elif isinstance(data, list):
63
+ return '\n'.join(filter(None, [extract_text_recursive(item, include_timestamps) for item in data]))
64
  return None
65
 
66
+ text = extract_text_recursive(segments, include_timestamps)
67
 
68
  if text:
69
  return text.strip()
 
373
  else:
374
  formatted_value = str(value)
375
 
376
+ # Replace underscores with spaces in the key name
377
+ formatted_key = key.replace('_', ' ').capitalize()
378
+ formatted_text += f"{formatted_key}: {formatted_value}\n"
379
  return formatted_text.strip()
380
 
381
  # # Example usage:
 
502
  def create_download_directory(title):
503
  base_dir = "Results"
504
  # Remove characters that are illegal in Windows filenames and normalize
505
+ safe_title = normalize_title(title, preserve_spaces=False)
506
  logging.debug(f"{title} successfully normalized")
507
  session_path = os.path.join(base_dir, safe_title)
508
  if not os.path.exists(session_path):
 
515
 
516
def safe_read_file(file_path):
    """
    Read a file as text, trying a detected encoding first and then a fixed fallback list.

    The file is read once as raw bytes. The encoding guessed by chardet (when
    available) is tried first, followed by the static fallback encodings. A
    decoding is accepted only if it contains no non-printable characters other
    than ordinary layout whitespace (tab, newline, carriage return, vertical
    tab, form feed).

    Args:
        file_path (str): Path of the file to read.

    Returns:
        str: The decoded file content on success. On failure an error message
        string is returned instead of raising: ``File not found: ...``,
        ``An error occurred while reading the file: ...`` or
        ``Unable to decode the file ...``.
    """
    encodings = ['utf-8', 'utf-16', 'ascii', 'latin-1', 'iso-8859-1', 'cp1252']

    try:
        with open(file_path, 'rb') as file:
            raw_data = file.read()
    except FileNotFoundError:
        return f"File not found: {file_path}"
    except Exception as e:
        return f"An error occurred while reading the file: {e}"

    # chardet is only a hint: if it is unavailable or has no guess, the static
    # fallback list above is still tried in order.
    try:
        import chardet
        detected_encoding = chardet.detect(raw_data).get('encoding')
    except ImportError:
        detected_encoding = None
    if detected_encoding is not None:
        encodings.insert(0, detected_encoding)

    # str.isprintable() is False for newlines and tabs, which every normal text
    # file contains, so strip layout whitespace before applying the check.
    layout_chars = dict.fromkeys(map(ord, '\t\n\r\x0b\x0c'))

    for encoding in encodings:
        try:
            decoded_content = raw_data.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the detector may report a codec name Python does not know.
            continue
        if decoded_content.translate(layout_chars).isprintable():
            return decoded_content

    return f"Unable to decode the file {file_path} with any of the attempted encodings: {encodings}"
541
 
542
  #
 
611
  return sha256_hash.hexdigest() == expected_checksum
612
 
613
 
614
def normalize_title(title, preserve_spaces=False):
    """
    Turn an arbitrary title into an ASCII string that is safe to use as a file or directory name.

    Steps:
      1. Decompose to NFKD and drop every non-ASCII character.
      2. Replace every character that is not a word character, ``-`` or ``.``
         with ``_``. With ``preserve_spaces=True`` whitespace is kept, but each
         run of whitespace (including tabs and newlines, which are not valid in
         file names) is reduced to a single space.
      3. Collapse runs of underscores and trim underscores/spaces from both ends.

    Args:
        title (str): The raw title.
        preserve_spaces (bool): Keep spaces instead of turning them into underscores.

    Returns:
        str: The normalized title. May be empty if nothing usable remains.
    """
    # Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters
    title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')

    if preserve_spaces:
        # Tabs/newlines are whitespace too; fold every whitespace run into one plain space
        title = re.sub(r'\s+', ' ', title)
        # Replace special characters with underscores, but keep spaces
        title = re.sub(r'[^\w \-.]', '_', title)
    else:
        # Replace special characters and spaces with underscores
        title = re.sub(r'[^\w\-.]', '_', title)

    # Replace multiple consecutive underscores with a single underscore.
    # The characters / \ : " * ? < > | are already covered by the regex above,
    # so no separate replacement pass is needed.
    title = re.sub(r'_+', '_', title)

    return title.strip('_ ')
634
+
635
 
636
 
637
  def clean_youtube_url(url):
 
674
 
675
  return formatted_content
676
 
677
def sanitize_user_input(message):
    """
    Break up '{{' and '}}' sequences to prevent placeholder injection.

    A space is inserted after every brace that is immediately followed by
    another brace of the same kind, so the result never contains '{{' or '}}',
    regardless of how many braces appear in a row.

    Args:
        message (str): The user's message.

    Returns:
        str: Sanitized message.
    """
    # A lookahead is used instead of matching the pair itself: matching '{{'
    # consumes both braces, so a run such as '{{{{' would become '{ {{ {' and
    # still contain a placeholder opener.
    message = re.sub(r'\{(?=\{)', '{ ', message)
    message = re.sub(r'\}(?=\})', '} ', message)
    return message
691
 
692
  def format_file_path(file_path, fallback_path=None):
693
  if file_path and os.path.exists(file_path):
 
744
  #
745
  # File Handling Functions
746
 
747
# Track temp files for cleanup
temp_files = []
# NOTE(review): not referenced by the functions visible in this section — confirm it is still needed.
temp_file_paths = []


def save_temp_file(file):
    """
    Copy an uploaded file object into the system temp directory and track it for cleanup.

    Only the final path component of ``file.name`` is used for the destination.
    An absolute name or one containing ``..`` would otherwise make
    ``os.path.join`` resolve outside the temp directory (and, for an absolute
    name pointing at the source itself, truncate it before it is read).

    Args:
        file: Object exposing a ``name`` attribute and a ``read()`` method
            returning bytes.

    Returns:
        str: Path of the file written inside the temp directory.
    """
    safe_name = os.path.basename(file.name)
    temp_path = os.path.join(tempfile.gettempdir(), safe_name)
    with open(temp_path, 'wb') as f:
        f.write(file.read())
    # Remember the path so it can be deleted later
    temp_files.append(temp_path)
    return temp_path
759
+
760
def cleanup_temp_files():
    """
    Delete every tracked temporary file that still exists, then empty the tracking list.

    Removal failures are logged and do not stop the remaining files from
    being processed.
    """
    for file_path in temp_files:
        # Nothing to do for paths that are already gone
        if not os.path.exists(file_path):
            continue
        try:
            os.remove(file_path)
            logging.info(f"Removed temporary file: {file_path}")
        except Exception as e:
            logging.error(f"Failed to remove temporary file {file_path}: {e}")
    # The list is emptied in place, whether or not each removal succeeded
    temp_files.clear()
770
 
771
def generate_unique_id():
    """Return a new identifier of the form ``uploaded_file_<uuid4>``."""
    random_suffix = uuid.uuid4()
    return "uploaded_file_" + str(random_suffix)
773
 
774
  #
775
  # End of File Handling Functions