# URLs identifying individual named data sources; the trailing comment on each
# entry gives the dataset label that URL stands for.
# NOTE(review): judging by the name, this list presumably separates these
# sources from general web pages -- confirm against the code that reads it.
# Entries that are commented out are disabled; the reason is not recorded here.
# Keep one entry per line: a `#` comment runs to the end of the physical line,
# so joining entries on one line would comment out everything after the first
# label, including the closing bracket.
non_web_urls = [
    'https://stackexchange.com/',  # stackexchange
    'https://www.ncbi.nlm.nih.gov/pmc/',  # pubmed
    # 'https://www.reddit.com/',  # OpenWebText2
    'https://arxiv.org/',  # arxiv
    'https://github.com/',  # github
    'https://storage.courtlistener.com/',  # freelaw
    'https://bulkdata.uspto.gov/',  # uspto
    'https://pubmed.ncbi.nlm.nih.gov/',  # pubmed
    # 'https://www.gutenberg.org/',  # gutenberg
    'https://www.opensubtitles.org/',  # opensubtitles
    'https://www.wikipedia.org/',  # wikipedia
    'https://irclogs.ubuntu.com/',  # ubuntu IRC
    # 'https://www.smashwords.com/books/',  # bookscorpus2
    'https://www.statmt.org/',  # EuroParl
    'https://news.ycombinator.com/',  # hackerNews for comments only
    'https://www.youtube.com/',  # youtube subs
    'https://philpapers.org/',  # Philpaper
    # 'https://reporter.nih.gov/',  # NIH exporter
]