import argparse
import csv
import json
import os
import time
from time import sleep

import fitz  # PyMuPDF, used to extract text from PDF responses
import numpy as np
import pandas as pd
import requests

from src.retrieval.html2lines import url2lines, line_correction

# Allow very long fields in the input TSV.
csv.field_size_limit(100000000)

MAX_RETRIES = 3  # attempts per URL before giving up
TIMEOUT = 5  # seconds to wait for each HTTP response

def scrape_text_from_url(url, temp_name):
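    """Fetch url and return its text as a list of corrected lines.

    PDF responses are saved under pdf_dir/ and parsed with PyMuPDF; all other
    responses go through url2lines. Returns [] when the URL cannot be fetched.
    """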
    response = None
    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(url, timeout=TIMEOUT)
        except requests.RequestException:
            if attempt < MAX_RETRIES - 1:
                sleep(3)  # brief back-off before the next attempt
        else:
            break  # stop retrying once a response arrives

    if response is None or response.status_code == 503:
        return []

    if url.endswith(".pdf"):
        # Save the PDF locally, then extract the text of every page.
        os.makedirs("pdf_dir", exist_ok=True)
        pdf_path = f"pdf_dir/{temp_name}.pdf"
        with open(pdf_path, "wb") as f:
            f.write(response.content)

        extracted_text = ""
        with fitz.open(pdf_path) as doc:
            for page in doc:
                extracted_text += page.get_text() or ""

        return line_correction(extracted_text.split("\n"))

    return line_correction(url2lines(url))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scraping text from URLs.")
    parser.add_argument(
        "-i",
        "--tsv_input_file",
        type=str,
        help="The path of the input file containing URLs from Google search.",
    )
    parser.add_argument(
        "-o",
        "--json_output_dir",
        type=str,
        default="output",
        help="The output directory for the scraped JSON data.",
    )
    parser.add_argument(
        "--overwrite_out_file",
        action="store_true",
        help="Overwrite the output file instead of resuming from it.",
    )

    args = parser.parse_args()

    assert (
        os.path.splitext(args.tsv_input_file)[-1] == ".tsv"
    ), "The input should be a tsv file."

    os.makedirs(args.json_output_dir, exist_ok=True)

    total_scraped, empty, total_failed = 0, 0, 0

    print(f"Processing file {args.tsv_input_file}")

    st = time.time()

    # The claim id comes from the input file name, e.g. "123.tsv" -> "123".
    claim_id = os.path.splitext(os.path.basename(args.tsv_input_file))[0]
    json_output_path = os.path.join(args.json_output_dir, f"{claim_id}.json")
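
    # Resume support: if the output file already exists and is not being
    # overwritten, count its lines so already-scraped URLs are skipped.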
    lines_skipped = 0
    if os.path.exists(json_output_path):
        if args.overwrite_out_file:
            os.remove(json_output_path)
        else:
            with open(json_output_path, "r", encoding="utf-8") as json_file:
                existing_data = json_file.readlines()
                lines_skipped = len(existing_data)
                print(f" Skipping {lines_skipped} lines in {json_output_path}")
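
    # Load the TSV, trying Pandas first, then NumPy, then the csv module.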
    try:
        df = pd.read_csv(args.tsv_input_file, sep="\t", header=None)
        data = df.values
        print("Data loaded successfully with Pandas.")
    except Exception as e:
        print("Error loading with Pandas:", e)
        try:
            data = np.genfromtxt(
                args.tsv_input_file, delimiter="\t", dtype=None, encoding=None
            )
            print("Data loaded successfully with NumPy.")
        except Exception as e:
            print("Error loading with NumPy:", e)
            try:
                data = []
                with open(args.tsv_input_file, "r", newline="") as tsvfile:
                    reader = csv.reader(tsvfile, delimiter="\t")
                    for row in reader:
                        data.append(row)
                print("Data loaded successfully with csv.")
            except Exception as e:
                print("Error loading with csv:", e)
                data = None
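
    # Each row is expected to hold the search type at index 1, the URL at
    # index 2, and the query at index 3 (see json_data below).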
    if data is None:
        print(f"Could not load {args.tsv_input_file}; nothing to process.")
    elif len(data) == lines_skipped:
        print(" No more lines need to be processed!")
    else:
        with open(json_output_path, "a", encoding="utf-8") as json_file:
            for index, row in enumerate(data):
                if index < lines_skipped:
                    continue
                url = row[2]
                json_data = {
                    "claim_id": claim_id,
                    "type": row[1],
                    "query": row[3],
                    "url": url,
                    "url2text": [],
                }
                print(f"Scraping text for url_{index}: {url}")
                try:
                    scrape_result = scrape_text_from_url(url, claim_id)
                    json_data["url2text"] = scrape_result

                    if len(json_data["url2text"]) > 0:
                        total_scraped += 1
                    else:
                        empty += 1
                except Exception as e:
                    print(f" Failed to scrape {url}: {e}")
                    total_failed += 1

                # One JSON object per line, flushed immediately, so an
                # interrupted run can resume from the last completed URL.
                json_file.write(json.dumps(json_data, ensure_ascii=False) + "\n")
                json_file.flush()

        print(f"Output for {args.tsv_input_file} saved to {json_output_path}")

    elapsed_time = time.time() - st
    elapsed_minutes = int(elapsed_time // 60)
    elapsed_seconds = int(elapsed_time % 60)
    print(f"Time elapsed: {elapsed_minutes}min {elapsed_seconds}sec")
    print(f"{total_scraped} scraped, {empty} empty, {total_failed} failed")
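
# Example invocation (file name and output directory are illustrative):
#   python scraper.py -i search_results/123.tsv -o scraped_output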