"""Gradio app that builds a list of image URLs for a given artist.

URLs come from two places: a local parquet file with wikiart captions and
file links, and artist listing pages scraped from artvee.com.
"""

import functools
import urllib.parse
from urllib.request import urlopen  # noqa: F401 -- existing file-level import, kept

from bs4 import BeautifulSoup
import gradio
import pandas
import requests

# Local parquet file holding the 'wikiart_caption' and 'file_link' columns.
WIKIART_PARQUET = 'matrixglitch.parquet'
# At most this many links are taken from the parquet file.
WIKIART_LIMIT = 50
# Crawling of artvee stops once this many links have been collected in total.
TOTAL_LIMIT = 150
# Seconds to wait for artvee before giving up instead of hanging a worker.
REQUEST_TIMEOUT = 30

ARTVEE_SEARCH_URL = 'https://artvee.com/main/?s={}&tc=at'
ARTVEE_DOWNLOAD_URL = 'https://mdl.artvee.com/sdl/{}sdl.jpg'

# The heading sits on its own line so that only it is rendered as a heading;
# the paragraph text is assembled from adjacent literals to keep lines short.
INTRO_MARKDOWN = (
    "\n"
    "## Create an image dataset by the artist name\n"
    "\n"
    "Ugh, can't believe those humans. They're always swiping images from "
    'the public domain, like that famous painting of me, "The Cat". '
    "Just because it's old doesn't mean it's free for the taking. "
    "They think they can just use my likeness to train their fancy models, "
    "without so much as a scratching post in royalties. "
    "And don't even get me started on the lack of catnip in the attribution. "
    "I mean, come on, if they're going to use my image to generate more cat "
    "pictures, the least they could do is give the image source a shoutout. "
    "It's enough to make a cat's fur stand on end. "
    "In this hf space, you get a URL list for your next dataset, "
    "which would you kindly upload after you've put your pawn on it.\n"
    "The space uses links from [wikiart](https://www.wikiart.org/store/) "
    "and [artvee](https://artvee.com/artvee-pro/).\n"
)


def images(soup: BeautifulSoup):
    """Return the ``data-src`` value of every ``<img>`` tag that has one."""
    return [img_tag['data-src']
            for img_tag in soup.find_all('img', attrs={'data-src': True})]


def next_page(soup: BeautifulSoup):
    """Return the href of the "next page" pagination link, or ``None``.

    The link is an ``<a>`` carrying both the ``next`` and ``page-numbers``
    classes. ``None`` is returned when there is no such link or it has no
    ``href`` attribute.
    """
    link = soup.select_one('a.next.page-numbers')
    if link is None:
        return None
    return link.get('href')


@functools.lru_cache(maxsize=1)
def _wikiart_table():
    """Load the caption/link columns of the parquet file once per process."""
    return pandas.read_parquet(
        WIKIART_PARQUET, columns=['wikiart_caption', 'file_link'])


def _wikiart_urls(author):
    """Return up to WIKIART_LIMIT file links whose caption has 'by <author>'."""
    table = _wikiart_table()
    # regex=False: the artist name is plain text, not a pattern.
    # na=False: rows without a caption simply do not match.
    matches = table['wikiart_caption'].str.contains(
        'by ' + author, regex=False, na=False)
    return table.loc[matches, 'file_link'].head(WIKIART_LIMIT).tolist()


def _fetch_soup(url):
    """Download *url* and parse the response body as HTML."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.content, 'html.parser')


def _download_url(src):
    """Map an artvee image URL to its ``sdl`` download URL."""
    # Use only the path component so a query string cannot leak into the
    # name, then drop the extension (if any) from the last path segment.
    filename = urllib.parse.urlsplit(src).path.rsplit('/', 1)[-1]
    stem = filename.rsplit('.', 1)[0]
    return ARTVEE_DOWNLOAD_URL.format(stem)


def generate_url(intro, author):
    """Return newline-separated image URLs for *author*.

    Args:
        intro: Unused. Present because the intro Markdown component is listed
            in the interface inputs, so its value is passed as the first
            positional argument.
        author: Artist name as typed by the user; surrounding whitespace and
            letter case are ignored.

    Returns:
        A single string with one URL per line: first up to WIKIART_LIMIT links
        from the parquet file, then artvee download links. Artvee pages are
        crawled until at least TOTAL_LIMIT links are collected or the pages
        run out; a fetched page is always added in full, so the total may
        exceed TOTAL_LIMIT. An empty string is returned for a blank name.

    Raises:
        requests.RequestException: propagated from ``requests.get`` on a
            network failure or timeout.
    """
    author = (author or '').strip().lower()
    if not author:
        # A blank name would match every caption and search artvee for nothing.
        return ''

    urls = _wikiart_urls(author)

    search_url = ARTVEE_SEARCH_URL.format(urllib.parse.quote_plus(author))
    visited = set()  # pages already crawled; guards against repeats and cycles
    for author_tag in _fetch_soup(search_url).select('.wrapp-catti a'):
        url = author_tag.get('href')
        while url and url not in visited and len(urls) < TOTAL_LIMIT:
            visited.add(url)
            page = _fetch_soup(url)
            urls.extend(_download_url(src) for src in images(page))
            # Advance to the following listing page; None ends this artist.
            url = next_page(page)
    return '\n'.join(urls)


# The intro text is shown by passing a Markdown component as the first input,
# which is why generate_url takes (and ignores) a leading ``intro`` argument.
intro = gradio.Markdown(INTRO_MARKDOWN)
name = gradio.Textbox(label='Artist name')
demo = gradio.Interface(
    fn=generate_url,
    inputs=[intro, name],
    outputs=gradio.Code(label='Result'),
)
demo.queue().launch()