|
|
|
|
|
|
|
import argparse |
|
import logging |
|
import os |
|
import pathlib |
|
import re |
|
import sys |
|
from multiprocessing.dummy import Pool |
|
from typing import NamedTuple, Optional, Tuple |
|
|
|
import requests |
|
from mmcv.utils import get_logger |
|
|
|
|
|
def parse_args(argv=None):
    """Parse the command-line options of the link checker.

    Args:
        argv (list[str], optional): Argument strings to parse. Defaults to
            None, in which case ``argparse`` reads ``sys.argv[1:]``, i.e.
            the behaviour of calling ``parse_args()`` without arguments is
            unchanged.

    Returns:
        argparse.Namespace: Namespace with the attributes ``num_threads``
        (int, default 100), ``https_proxy`` (str, None when not given) and
        ``out`` (str, default ``'link_reports.txt'``).
    """
    parser = argparse.ArgumentParser(
        description='Goes through all the inline-links '
        'in markdown files and reports the breakages')
    parser.add_argument(
        '--num-threads',
        type=int,
        default=100,
        # The value sizes a ``multiprocessing.dummy`` pool, which is made of
        # threads, so the help text talks about threads, not processes.
        help='Number of threads used to confirm the links')
    parser.add_argument('--https-proxy', type=str, help='https proxy')
    parser.add_argument(
        '--out',
        type=str,
        default='link_reports.txt',
        help='output path of reports')
    return parser.parse_args(argv)
|
|
|
|
|
# HTTP status codes for which a link is still reported as reachable, on top
# of every response that ``requests`` itself flags as ``ok``.
# NOTE(review): the non-200 entries presumably cover pages that exist but
# reject an anonymous HEAD request -- confirm the intent with the author.
OK_STATUS_CODES = (
    200,  # OK
    401,  # Unauthorized
    403,  # Forbidden
    405,  # Method Not Allowed
    406,  # Not Acceptable
)
|
|
|
|
|
class MatchTuple(NamedTuple):
    """One inline markdown link together with the file it was found in."""

    # Path of the markdown file that contains the link.
    source: str
    # Text captured between the square brackets of the link.
    name: str
    # Target captured between the parentheses: a URL or a file path.
    link: str
|
|
|
|
|
def check_link(
        match_tuple: MatchTuple,
        http_session: requests.Session,
        logger: Optional[logging.Logger] = None
) -> Tuple[MatchTuple, bool, Optional[str]]:
    """Check one link and report the outcome.

    Links that start with ``http://`` or ``https://`` are checked with
    ``check_url``; everything else is treated as a file path and checked
    with ``check_path``.

    Args:
        match_tuple (MatchTuple): The link to check.
        http_session (requests.Session): Session used for URL checks.
        logger (logging.Logger, optional): Logger that receives the
            per-link result line. Defaults to None, in which case the line
            is printed instead.

    Returns:
        tuple: ``(match_tuple, result_ok, reason)``. ``reason`` is the text
        returned by ``check_url`` and stays None for file-path links.
    """
    reason: Optional[str] = None
    # Require a full scheme: a bare ``startswith('http')`` would also send
    # relative files such as ``http_api.md`` to the HTTP client.
    if match_tuple.link.startswith(('http://', 'https://')):
        result_ok, reason = check_url(match_tuple, http_session)
    else:
        result_ok = check_path(match_tuple)

    # Use distinct marks for success and failure; with one shared glyph the
    # result line could not tell a broken link from a working one.
    message = f" {'✓' if result_ok else '✗'} {match_tuple.link}"
    if logger is None:
        print(message)
    else:
        logger.info(message)
    return match_tuple, result_ok, reason
|
|
|
|
|
def check_url(match_tuple: MatchTuple,
              http_session: requests.Session) -> Tuple[bool, str]:
    """Check if a URL is reachable.

    A HEAD request is sent with a 5 second timeout, following redirects.

    Args:
        match_tuple (MatchTuple): Link whose ``link`` field is the URL.
        http_session (requests.Session): Session used to send the request.

    Returns:
        tuple[bool, str]: Whether the URL counts as reachable, and a short
        reason. A response is reachable when ``requests`` flags it as ``ok``
        or its status code is listed in ``OK_STATUS_CODES``. Any request
        failure yields ``False`` instead of raising.
    """
    try:
        result = http_session.head(
            match_tuple.link, timeout=5, allow_redirects=True)
    except (requests.ConnectionError, requests.Timeout):
        return False, 'connection error'
    except requests.RequestException as err:
        # Other request failures (such as an invalid URL or too many
        # redirects) must mark this one link as broken rather than abort
        # the whole run from inside a pool worker.
        return False, f'request error ({type(err).__name__})'
    return (
        result.ok or result.status_code in OK_STATUS_CODES,
        f'status code = {result.status_code}',
    )
|
|
|
|
|
def check_path(match_tuple: MatchTuple) -> bool:
    """Tell whether the file a relative link points to exists.

    Args:
        match_tuple (MatchTuple): Link whose ``link`` field is a path
            relative to the directory of ``source``.

    Returns:
        bool: True if the resolved path exists on disk.
    """
    # Anything from the first '#' on is an in-page anchor, not part of the
    # file name.
    file_part, _, _ = match_tuple.link.partition('#')
    source_dir = os.path.dirname(str(match_tuple.source))
    candidate = os.path.join(source_dir, file_part)
    return os.path.exists(candidate)
|
|
|
|
|
def _build_http_session(num_threads):
    """Create a session whose HTTP(S) adapters retry and pool connections."""
    http_session = requests.Session()
    for resource_prefix in ('http://', 'https://'):
        http_session.mount(
            resource_prefix,
            requests.adapters.HTTPAdapter(
                max_retries=5,
                pool_connections=20,
                # One pooled connection per worker thread.
                pool_maxsize=num_threads),
        )
    return http_session


def _collect_links(markdown_files):
    """Return the set of ``MatchTuple`` for all inline links in the files."""
    # Captures ``[name](link)``: the name must not start with '!' and the
    # link must not contain parentheses.
    url_regex = re.compile(r'\[([^!][^\]]+)\]\(([^)(]+)\)')
    all_matches = set()
    for markdown_file in markdown_files:
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's default encoding.
        with open(markdown_file, encoding='utf-8') as handle:
            for line in handle:
                for name, link in url_regex.findall(line):
                    # Links to a local server are never checked.
                    if 'localhost' not in link:
                        all_matches.add(
                            MatchTuple(
                                source=str(markdown_file),
                                name=name,
                                link=link))
    return all_matches


def main():
    """Check every inline link of the project's markdown files.

    Markdown files are searched recursively below the parent directory of
    the directory that contains this script. Each collected link is checked
    in a thread pool of ``--num-threads`` workers, and the results are
    written through a logger whose log file is ``--out``.

    Raises:
        SystemExit: With status 1 when at least one link is unreachable.
    """
    args = parse_args()

    logger = get_logger(name='mmdet', log_file=args.out)

    if args.https_proxy:
        os.environ['https_proxy'] = args.https_proxy

    logger.info('Finding all markdown files in the current directory...')

    project_root = (pathlib.Path(__file__).parent / '..').resolve()
    markdown_files = project_root.glob('**/*.md')
    all_matches = _collect_links(markdown_files)

    # The set holds links, not files, so report it as such.
    logger.info(f' {len(all_matches)} links found')
    logger.info('Checking to make sure we can retrieve each link...')

    # The ``with`` block closes the session's pooled connections when the
    # checks are done.
    with _build_http_session(args.num_threads) as http_session:
        with Pool(processes=args.num_threads) as pool:
            # Sorted so that the order of the report is reproducible.
            results = pool.starmap(check_link,
                                   [(match, http_session, logger)
                                    for match in sorted(all_matches)])

    unreachable_results = [(match_tuple, reason)
                           for match_tuple, success, reason in results
                           if not success]

    if unreachable_results:
        logger.info('================================================')
        logger.info(f'Unreachable links ({len(unreachable_results)}):')
        for match_tuple, reason in unreachable_results:
            logger.info(' > Source: ' + match_tuple.source)
            logger.info(' Name: ' + match_tuple.name)
            logger.info(' Link: ' + match_tuple.link)
            if reason is not None:
                logger.info(' Reason: ' + reason)
        sys.exit(1)
    logger.info('No Unreachable link found.')
|
|
|
|
|
# Run the link check only when this file is executed as a script.
if __name__ == '__main__':
    main()
|
|