camenduru's picture
thanks to show ❀
3bbb319
# Modified from:
# https://github.com/allenai/allennlp/blob/main/scripts/check_links.py
import argparse
import logging
import os
import pathlib
import re
import sys
from multiprocessing.dummy import Pool
from typing import NamedTuple, Optional, Tuple
import requests
from mmcv.utils import get_logger
def parse_args():
parser = argparse.ArgumentParser(
description='Goes through all the inline-links '
'in markdown files and reports the breakages')
parser.add_argument(
'--num-threads',
type=int,
default=100,
help='Number of processes to confirm the link')
parser.add_argument('--https-proxy', type=str, help='https proxy')
parser.add_argument(
'--out',
type=str,
default='link_reports.txt',
help='output path of reports')
args = parser.parse_args()
return args
OK_STATUS_CODES = (
200,
401, # the resource exists but may require some sort of login.
403, # ^ same
405, # HEAD method not allowed.
# the resource exists, but our default 'Accept-' header may not
# match what the server can provide.
406,
)
class MatchTuple(NamedTuple):
source: str
name: str
link: str
def check_link(
match_tuple: MatchTuple,
http_session: requests.Session,
logger: logging = None) -> Tuple[MatchTuple, bool, Optional[str]]:
reason: Optional[str] = None
if match_tuple.link.startswith('http'):
result_ok, reason = check_url(match_tuple, http_session)
else:
result_ok = check_path(match_tuple)
if logger is None:
print(f" {'βœ“' if result_ok else 'βœ—'} {match_tuple.link}")
else:
logger.info(f" {'βœ“' if result_ok else 'βœ—'} {match_tuple.link}")
return match_tuple, result_ok, reason
def check_url(match_tuple: MatchTuple,
http_session: requests.Session) -> Tuple[bool, str]:
"""Check if a URL is reachable."""
try:
result = http_session.head(
match_tuple.link, timeout=5, allow_redirects=True)
return (
result.ok or result.status_code in OK_STATUS_CODES,
f'status code = {result.status_code}',
)
except (requests.ConnectionError, requests.Timeout):
return False, 'connection error'
def check_path(match_tuple: MatchTuple) -> bool:
"""Check if a file in this repository exists."""
relative_path = match_tuple.link.split('#')[0]
full_path = os.path.join(
os.path.dirname(str(match_tuple.source)), relative_path)
return os.path.exists(full_path)
def main():
args = parse_args()
# setup logger
logger = get_logger(name='mmdet', log_file=args.out)
# setup https_proxy
if args.https_proxy:
os.environ['https_proxy'] = args.https_proxy
# setup http_session
http_session = requests.Session()
for resource_prefix in ('http://', 'https://'):
http_session.mount(
resource_prefix,
requests.adapters.HTTPAdapter(
max_retries=5,
pool_connections=20,
pool_maxsize=args.num_threads),
)
logger.info('Finding all markdown files in the current directory...')
project_root = (pathlib.Path(__file__).parent / '..').resolve()
markdown_files = project_root.glob('**/*.md')
all_matches = set()
url_regex = re.compile(r'\[([^!][^\]]+)\]\(([^)(]+)\)')
for markdown_file in markdown_files:
with open(markdown_file) as handle:
for line in handle.readlines():
matches = url_regex.findall(line)
for name, link in matches:
if 'localhost' not in link:
all_matches.add(
MatchTuple(
source=str(markdown_file),
name=name,
link=link))
logger.info(f' {len(all_matches)} markdown files found')
logger.info('Checking to make sure we can retrieve each link...')
with Pool(processes=args.num_threads) as pool:
results = pool.starmap(check_link, [(match, http_session, logger)
for match in list(all_matches)])
# collect unreachable results
unreachable_results = [(match_tuple, reason)
for match_tuple, success, reason in results
if not success]
if unreachable_results:
logger.info('================================================')
logger.info(f'Unreachable links ({len(unreachable_results)}):')
for match_tuple, reason in unreachable_results:
logger.info(' > Source: ' + match_tuple.source)
logger.info(' Name: ' + match_tuple.name)
logger.info(' Link: ' + match_tuple.link)
if reason is not None:
logger.info(' Reason: ' + reason)
sys.exit(1)
logger.info('No Unreachable link found.')
if __name__ == '__main__':
main()