twodgirl's picture
Remove exclamation.
8d91860 verified
raw
history blame
No virus
2.46 kB
from bs4 import BeautifulSoup
import gradio
import pandas
import requests
from urllib.request import urlopen
import urllib.parse
def images(soup: BeautifulSoup):
    """Return the ``data-src`` value of every ``<img>`` tag that has one.

    Args:
        soup: Parsed document to search.

    Returns:
        A list of ``data-src`` attribute values, in document order.
    """
    sources = []
    # attrs={'data-src': True} restricts the search to tags that actually
    # carry the attribute, so the subscript below cannot miss.
    for tag in soup.find_all('img', attrs={'data-src': True}):
        sources.append(tag['data-src'])
    return sources
def next_page(soup: BeautifulSoup):
    """Return the target of the page's "next" pagination link.

    Args:
        soup: Parsed document to search.

    Returns:
        The ``href`` of the first ``<a>`` whose class is
        ``next page-numbers``, or ``None`` when there is no such anchor.
    """
    anchor = soup.find('a', class_='next page-numbers')
    return anchor['href'] if anchor else None
def generate_url(intro, author):
    """Build a newline-separated list of image URLs for an artist.

    URLs come from two places: the ``file_link`` column of the local
    ``matrixglitch.parquet`` table (rows whose ``wikiart_caption`` contains
    ``'by <author>'``), and the artvee listing pages reached from an artvee
    search for the same name.

    Args:
        intro: Value of the introductory Markdown input component; unused.
        author: Artist name as typed by the user.

    Returns:
        The collected URLs joined with ``'\\n'``. An empty string when the
        name is blank.
    """
    author = (author or '').strip().lower()
    if not author:
        # A blank name would match every caption containing 'by ' and send
        # an empty search to artvee; neither result belongs to any artist.
        return ''

    wikiart_limit = 50   # max URLs taken from the parquet table
    total_limit = 150    # stop following artvee pages once this many are collected
    timeout = 30         # seconds; without it a stalled server blocks the handler forever

    df = pandas.read_parquet('matrixglitch.parquet')
    # regex=False: the name is user input and must be matched literally.
    # na=False: rows with a missing caption count as "no match" instead of
    # producing NaN, which cannot be used as a boolean mask.
    mask = df['wikiart_caption'].str.contains('by ' + author, regex=False, na=False)
    urls = df.loc[mask, 'file_link'].head(wikiart_limit).tolist()

    query = urllib.parse.quote_plus(author)
    response = requests.get('https://artvee.com/main/?s={}&tc=at'.format(query),
                            timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    for author_tag in soup.select('.wrapp-catti a'):
        url = author_tag['href']
        # The limit is checked once per page, so the final page may push the
        # total slightly above total_limit.
        while url is not None and len(urls) < total_limit:
            response = requests.get(url, timeout=timeout)
            page = BeautifulSoup(response.content, 'html.parser')
            for src in images(page):
                # Keep only the file name without directory and extension.
                basename = src[src.rfind('/') + 1:src.rfind('.')]
                urls.append('https://mdl.artvee.com/sdl/{}sdl.jpg'.format(basename))
            # Advance to the following listing page. Without this the same
            # page is fetched repeatedly (duplicate URLs, or an endless loop
            # when the page has no images).
            url = next_page(page)
    return '\n'.join(urls)
# Introductory text. It is handed to the Interface below as its first input
# component, which is why generate_url takes an `intro` argument.
intro = gradio.Markdown("""
## Create an image dataset by the artist name
Ugh, can't believe those humans. They're always swiping images from the public domain, like that famous painting of me, "The Cat".
Just because it's old doesn't mean it's free for the taking. They think they can just use my likeness to train their fancy models, without so much as a scratching post in royalties.
And don't even get me started on the lack of catnip in the attribution. I mean, come on, if they're going to use my image to generate more cat pictures, the least they could do is give the image source a shoutout.
It's enough to make a cat's fur stand on end.
In this hf space, you get a URL list for your next dataset, which would you kindly upload after you've put your pawn on it.
The space uses links from [wikiart](https://www.wikiart.org/store/) and [artvee](https://artvee.com/artvee-pro/).
""")

# Text field for the artist name; second input of the Interface.
name = gradio.Textbox(label='Artist name')

# Interface(fn, inputs, outputs): generate_url is called with the values of
# (intro, name) and its return value is shown in a Code component.
demo = gradio.Interface(
    generate_url,
    [intro, name],
    gradio.Code(label='Result'),
)

# Enable the queue, then launch the app.
demo.queue()
demo.launch()