|
from bs4 import BeautifulSoup |
|
import gradio |
|
import pandas |
|
import requests |
|
from urllib.request import urlopen |
|
import urllib.parse |
|
|
|
def images(soup: BeautifulSoup):
    """Return the ``data-src`` value of every ``<img>`` tag that has one.

    Args:
        soup: Parsed document to search.

    Returns:
        A list with one ``data-src`` attribute value per matching tag, in
        the order ``find_all`` yields them.
    """
    sources = []
    # ``attrs={'data-src': True}`` restricts the search to tags that carry
    # the attribute, so the subscript below cannot miss.
    for tag in soup.find_all('img', attrs={'data-src': True}):
        sources.append(tag['data-src'])
    return sources
|
|
|
def next_page(soup: BeautifulSoup):
    """Return the ``href`` of the "next page" pagination link, if any.

    Args:
        soup: Parsed listing page.

    Returns:
        The ``href`` of the first ``<a>`` whose class attribute is
        ``next page-numbers``, or ``None`` when no such link is found.
    """
    anchor = soup.find('a', class_='next page-numbers')
    if not anchor:
        # No pagination link on this page.
        return None
    return anchor['href']
|
|
|
def _artvee_download_url(src):
    """Build the ``mdl.artvee.com/sdl`` image URL for a thumbnail URL."""
    # Drop the directory part first so a dot in the host or path can never
    # be mistaken for the file-extension separator.
    filename = src.rsplit('/', 1)[-1]
    basename = filename.rsplit('.', 1)[0]
    return 'https://mdl.artvee.com/sdl/{}sdl.jpg'.format(basename)


def generate_url(intro, author):
    """Build a newline-separated list of image URLs for an artist.

    Up to 50 links are taken from the local ``matrixglitch.parquet`` table
    (rows whose ``wikiart_caption`` contains ``'by <author>'``).  The rest
    are scraped from the artvee.com search results for the same name: each
    matched listing is followed page by page until roughly 150 URLs have
    been collected (the last page is appended whole, so the total may
    slightly exceed 150).

    Args:
        intro: Unused.  It is accepted because the interface passes the
            intro Markdown component as the first input.
        author: Artist name.  Surrounding whitespace is stripped and the
            name is lower-cased before it is used.

    Returns:
        The collected URLs joined with ``'\\n'``, or an empty string when
        the artist name is blank.
    """
    wikiart_limit = 50
    total_limit = 150
    timeout = 30  # seconds; without it a stalled server blocks the request forever

    author = (author or '').strip().lower()
    if not author:
        # A blank name would match nearly every caption ('by ') and send an
        # empty search to artvee, producing an unrelated list.
        return ''

    df = pandas.read_parquet('matrixglitch.parquet')
    # regex=False: the name is user input and must be matched literally.
    # na=False: rows without a caption must count as "no match" instead of
    # putting NaN into the boolean mask.
    # NOTE(review): the match is case-sensitive against the lower-cased name,
    # so captions are presumably stored in lower case - verify.
    mask = df['wikiart_caption'].str.contains('by ' + author, regex=False, na=False)
    urls = df.loc[mask, 'file_link'].head(wikiart_limit).tolist()

    query = urllib.parse.quote_plus(author)
    response = requests.get(
        'https://artvee.com/main/?s={}&tc=at'.format(query),
        timeout=timeout,
    )
    search_soup = BeautifulSoup(response.content, 'html.parser')

    visited = set()  # guards against duplicate listings and pagination cycles
    for author_tag in search_soup.select('.wrapp-catti a'):
        url = author_tag.get('href')
        while url is not None and url not in visited and len(urls) < total_limit:
            visited.add(url)
            response = requests.get(url, timeout=timeout)
            page_soup = BeautifulSoup(response.content, 'html.parser')
            urls.extend(_artvee_download_url(src) for src in images(page_soup))
            # Advance to the following listing page; without this step the
            # loop would keep refetching the same page.
            url = next_page(page_soup)

    return '\n'.join(urls)
|
|
|
# Introductory text shown in the interface.  It is wired in as the first
# input component, which is why ``generate_url`` takes an ``intro`` argument.
intro = gradio.Markdown("""
## Create an image dataset by the artist name

Ugh, can't believe those humans. They're always swiping images from the public domain, like that famous painting of me, "The Cat".

Just because it's old doesn't mean it's free for the taking. They think they can just use my likeness to train their fancy models, without so much as a scratching post in royalties.
And don't even get me started on the lack of catnip in the attribution. I mean, come on, if they're going to use my image to generate more cat pictures, the least they could do is give the image source a shoutout.

It's enough to make a cat's fur stand on end.

In this hf space, you get a URL list for your next dataset, which would you kindly upload after you've put your pawn on it.

The space uses links from [wikiart](https://www.wikiart.org/store/) and [artvee](https://artvee.com/artvee-pro/).
""")

# Single text field for the artist name; the URL list is shown in a code box.
name = gradio.Textbox(label='Artist name')
demo = gradio.Interface(fn=generate_url, inputs=[intro, name], outputs=gradio.Code(label='Result'))

# Enable request queuing, then start the app.
demo.queue().launch()
|
|
|
|