Commit 400590a by LittleApple-fp16
1 Parent(s): 01ac6b3

Upload 12 files

danbooru_cawler/.gitignore ADDED
@@ -0,0 +1,162 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ pics/
+ *.pyc
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
danbooru_cawler/README.md ADDED
@@ -0,0 +1,32 @@
+ # danbooru_cawler
+ A Danbooru image crawler built on Scrapy
+
+ ## Running
+
+ The first run needs a one-time environment setup:
+
+ ```
+ python -m venv venv
+ .\venv\Scripts\activate
+ python -m pip install --upgrade pip
+ pip install -r requirements.txt
+ ```
+
+ Every run after that only needs:
+
+ ```
+ .\venv\Scripts\activate
+ python .\main.py
+ ```
+
+ ## Notes
+
+ To change which tags are crawled, edit the SEARCH_TAG value in /danbooru_crawler/settings.py; join multiple tags with +.
+
+ Images are saved to the /pics/full folder.
+
+ If no images appear after the crawler has been running for a while, your Pillow version may be too old; with the virtual environment active, run
+ ```
+ pip uninstall Pillow
+ pip install Pillow
+ ```
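
To illustrate the configuration step described in the README above, here is a minimal sketch of the relevant lines in danbooru_crawler/settings.py; the tag values shown are placeholders, not values from this commit:

```
# danbooru_crawler/settings.py -- illustrative values only
SEARCH_TAG = "scenery+1girl"  # placeholder; join multiple Danbooru tags with "+"
SEARCH_TYPE = 1               # 1 = original images, 0 = thumbnails
SEARCH_LINK = False           # True also follows related (parent/child) posts
```

With these settings, downloaded images land in ./pics/full, and the spider writes a matching <name>.txt tag file to ./pics for each post.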
danbooru_cawler/danbooru_crawler/__init__.py ADDED
File without changes
danbooru_cawler/danbooru_crawler/items.py ADDED
@@ -0,0 +1,18 @@
+ # Define here the models for your scraped items
+ #
+ # See documentation in:
+ # https://docs.scrapy.org/en/latest/topics/items.html
+
+ import scrapy
+
+
+ class DanbooruCrawlerItem(scrapy.Item):
+     # define the fields for your item here like:
+     # name = scrapy.Field()
+     pass
+
+
+ class ImagedownloadItem(scrapy.Item):
+     image_name = scrapy.Field()
+     image_urls = scrapy.Field()
+     images = scrapy.Field()
danbooru_cawler/danbooru_crawler/middlewares.py ADDED
@@ -0,0 +1,103 @@
+ # Define here the models for your spider middleware
+ #
+ # See documentation in:
+ # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+ from scrapy import signals
+
+ # useful for handling different item types with a single interface
+ from itemadapter import is_item, ItemAdapter
+
+
+ class DanbooruCrawlerSpiderMiddleware:
+     # Not all methods need to be defined. If a method is not defined,
+     # scrapy acts as if the spider middleware does not modify the
+     # passed objects.
+
+     @classmethod
+     def from_crawler(cls, crawler):
+         # This method is used by Scrapy to create your spiders.
+         s = cls()
+         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+         return s
+
+     def process_spider_input(self, response, spider):
+         # Called for each response that goes through the spider
+         # middleware and into the spider.
+
+         # Should return None or raise an exception.
+         return None
+
+     def process_spider_output(self, response, result, spider):
+         # Called with the results returned from the Spider, after
+         # it has processed the response.
+
+         # Must return an iterable of Request, or item objects.
+         for i in result:
+             yield i
+
+     def process_spider_exception(self, response, exception, spider):
+         # Called when a spider or process_spider_input() method
+         # (from other spider middleware) raises an exception.
+
+         # Should return either None or an iterable of Request or item objects.
+         pass
+
+     def process_start_requests(self, start_requests, spider):
+         # Called with the start requests of the spider, and works
+         # similarly to the process_spider_output() method, except
+         # that it doesn’t have a response associated.
+
+         # Must return only requests (not items).
+         for r in start_requests:
+             yield r
+
+     def spider_opened(self, spider):
+         spider.logger.info("Spider opened: %s" % spider.name)
+
+
+ class DanbooruCrawlerDownloaderMiddleware:
+     # Not all methods need to be defined. If a method is not defined,
+     # scrapy acts as if the downloader middleware does not modify the
+     # passed objects.
+
+     @classmethod
+     def from_crawler(cls, crawler):
+         # This method is used by Scrapy to create your spiders.
+         s = cls()
+         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+         return s
+
+     def process_request(self, request, spider):
+         # Called for each request that goes through the downloader
+         # middleware.
+
+         # Must either:
+         # - return None: continue processing this request
+         # - or return a Response object
+         # - or return a Request object
+         # - or raise IgnoreRequest: process_exception() methods of
+         #   installed downloader middleware will be called
+         return None
+
+     def process_response(self, request, response, spider):
+         # Called with the response returned from the downloader.
+
+         # Must either:
+         # - return a Response object
+         # - return a Request object
+         # - or raise IgnoreRequest
+         return response
+
+     def process_exception(self, request, exception, spider):
+         # Called when a download handler or a process_request()
+         # (from other downloader middleware) raises an exception.
+
+         # Must either:
+         # - return None: continue processing this exception
+         # - return a Response object: stops process_exception() chain
+         # - return a Request object: stops process_exception() chain
+         pass
+
+     def spider_opened(self, spider):
+         spider.logger.info("Spider opened: %s" % spider.name)
danbooru_cawler/danbooru_crawler/pipelines.py ADDED
@@ -0,0 +1,22 @@
+ # Define your item pipelines here
+ #
+ # Don't forget to add your pipeline to the ITEM_PIPELINES setting
+ # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+ # useful for handling different item types with a single interface
+ import scrapy
+ from itemadapter import ItemAdapter
+ from scrapy.exceptions import DropItem
+ from scrapy.pipelines.images import ImagesPipeline
+
+
+ class DanbooruCrawlerPipeline:
+     def process_item(self, item, spider):
+         return item
+
+
+ class PicsDownloadPipeline(ImagesPipeline):
+     def file_path(self, request, response=None, info=None, *, item=None):
+         image_guid = item["image_name"]
+         return f"full/{image_guid}.jpg"
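
A note on the pipeline above: overriding ImagesPipeline.file_path stores each downloaded image under IMAGES_STORE using the item's image_name instead of Scrapy's default hash-based filename. As a rough sketch of the item the spider feeds it (the name and URL below are placeholders, not taken from this commit):

```
from danbooru_crawler.items import ImagedownloadItem

# Hypothetical item; with the pipeline above, the downloaded file would be
# saved as "<IMAGES_STORE>/full/abc123.jpg", i.e. "./pics/full/abc123.jpg".
item = ImagedownloadItem()
item["image_name"] = "abc123"                             # placeholder name
item["image_urls"] = ["https://example.com/abc123.jpg"]   # placeholder URL
```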
danbooru_cawler/danbooru_crawler/settings.py ADDED
@@ -0,0 +1,98 @@
+ # Scrapy settings for danbooru_crawler project
+ #
+ # For simplicity, this file contains only settings considered important or
+ # commonly used. You can find more settings consulting the documentation:
+ #
+ # https://docs.scrapy.org/en/latest/topics/settings.html
+ # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+ # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+ BOT_NAME = "danbooru_crawler"
+ SEARCH_TAG = "tsubasa_tsubasa"
+ SEARCH_TYPE = 1  # 0 / 1: 0 downloads thumbnails, 1 downloads the original images
+ SEARCH_LINK = False  # True / False: chained search (also crawls related child posts; useful when training a style, but for concept training it can pull images that do not match the tag and pollute the training set)
+
+ SPIDER_MODULES = ["danbooru_crawler.spiders"]
+ NEWSPIDER_MODULE = "danbooru_crawler.spiders"
+
+
+ # Crawl responsibly by identifying yourself (and your website) on the user-agent
+ # USER_AGENT = "danbooru_crawler (+http://www.yourdomain.com)"
+
+ # Obey robots.txt rules
+ ROBOTSTXT_OBEY = False
+
+ # Configure maximum concurrent requests performed by Scrapy (default: 16)
+ # CONCURRENT_REQUESTS = 32
+
+ # Configure a delay for requests for the same website (default: 0)
+ # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+ # See also autothrottle settings and docs
+ # DOWNLOAD_DELAY = 3
+ # The download delay setting will honor only one of:
+ # CONCURRENT_REQUESTS_PER_DOMAIN = 16
+ # CONCURRENT_REQUESTS_PER_IP = 16
+
+ # Disable cookies (enabled by default)
+ # COOKIES_ENABLED = False
+
+ # Disable Telnet Console (enabled by default)
+ # TELNETCONSOLE_ENABLED = False
+
+ # Override the default request headers:
+ # DEFAULT_REQUEST_HEADERS = {
+ #     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+ #     "Accept-Language": "en",
+ # }
+
+ # Enable or disable spider middlewares
+ # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+ # SPIDER_MIDDLEWARES = {
+ #     "danbooru_crawler.middlewares.DanbooruCrawlerSpiderMiddleware": 543,
+ # }
+
+ # Enable or disable downloader middlewares
+ # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+ # DOWNLOADER_MIDDLEWARES = {
+ #     "danbooru_crawler.middlewares.DanbooruCrawlerDownloaderMiddleware": 543,
+ # }
+
+ # Enable or disable extensions
+ # See https://docs.scrapy.org/en/latest/topics/extensions.html
+ # EXTENSIONS = {
+ #     "scrapy.extensions.telnet.TelnetConsole": None,
+ # }
+
+ # Configure item pipelines
+ # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ ITEM_PIPELINES = {
+     "danbooru_crawler.pipelines.PicsDownloadPipeline": 1,
+     # "scrapy.pipelines.PicsDownloadPipeline": 1,
+ }
+ IMAGES_STORE = "./pics"
+
+ # Enable and configure the AutoThrottle extension (disabled by default)
+ # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+ # AUTOTHROTTLE_ENABLED = True
+ # The initial download delay
+ # AUTOTHROTTLE_START_DELAY = 5
+ # The maximum download delay to be set in case of high latencies
+ # AUTOTHROTTLE_MAX_DELAY = 60
+ # The average number of requests Scrapy should be sending in parallel to
+ # each remote server
+ # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+ # Enable showing throttling stats for every response received:
+ # AUTOTHROTTLE_DEBUG = False
+
+ # Enable and configure HTTP caching (disabled by default)
+ # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+ # HTTPCACHE_ENABLED = True
+ # HTTPCACHE_EXPIRATION_SECS = 0
+ # HTTPCACHE_DIR = "httpcache"
+ # HTTPCACHE_IGNORE_HTTP_CODES = []
+ # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+ # Set settings whose default value is deprecated to a future-proof value
+ REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+ FEED_EXPORT_ENCODING = "utf-8"
danbooru_cawler/danbooru_crawler/spiders/__init__.py ADDED
@@ -0,0 +1,4 @@
+ # This package will contain the spiders of your Scrapy project
+ #
+ # Please refer to the documentation for information on how to create and manage
+ # your spiders.
danbooru_cawler/danbooru_crawler/spiders/danbooru.py ADDED
@@ -0,0 +1,62 @@
+ import scrapy
+ from .. import settings
+ from .. import items
+ from urllib import parse
+
+
+ class DanbooruSpider(scrapy.Spider):
+     # custom_settings = {"DOWNLOAD_DELAY": 0.3, "RANDOMIZE_DOWNLOAD_DELAY": True}
+
+     name = "danbooru"
+     allowed_domains = ["donmai.us"]
+     url = "https://danbooru.donmai.us"
+     start_urls = [f"https://danbooru.donmai.us/posts?tags={settings.SEARCH_TAG}"]
+
+     def parse(self, response):
+         """Search results page."""
+         # Follow each post detail page
+         for _ in response.xpath('//a[@class="post-preview-link"]/@href').getall():
+             self.logger.info(f"detail page {parse.urljoin(self.url, _)}")
+             yield response.follow(_, callback=self.parse_pic_page_url, priority=1)
+         # Next page of search results
+         next_url = response.xpath('//a[@rel="next" and @href]/@href').get()
+         if next_url is not None:
+             self.logger.info(f"next page {parse.urljoin(self.url, next_url)}")
+             yield response.follow(next_url, callback=self.parse, priority=2)
+
+     def parse_pic_page_url(self, response):
+         """Post detail page."""
+         self.logger.info(f"detail page {response.url}")
+         img_url = None
+         # Related posts (parent/child set)
+         if settings.SEARCH_LINK:
+             link_img_url_list = response.xpath(
+                 '//div/article[not(contains(@class,"current-post"))]/div/a/@href'
+             ).getall()
+             for _ in link_img_url_list:
+                 yield response.follow(_, callback=self.parse_pic_page_url, priority=1)
+
+         # Extract the image source URL
+         if settings.SEARCH_TYPE == 0:
+             img_url = response.xpath("//section/picture/source/@srcset").get()
+         elif settings.SEARCH_TYPE == 1:
+             img_url = response.xpath(
+                 '//li/a[@class="image-view-original-link"]/@href'
+             ).get()
+             if img_url is None:
+                 img_url = response.xpath("//section/picture/source/@srcset").get()
+         if img_url is None:
+             # No usable image source on this post (e.g. video-only posts); skip it.
+             self.logger.warning(f"no image url found on {response.url}")
+             return
+         self.logger.info(f"image url {img_url}")
+         tags = response.xpath("//@data-tags").get()
+         name = img_url.split("/")[-1].split(".")[0]
+         with open(f"./pics/{name}.txt", "w") as txt_file:
+             tags = tags.split(" ")
+             tags = ",".join(tags)
+             txt_file.write(tags)
+         img_items = items.ImagedownloadItem()
+         img_items["image_urls"] = [img_url]
+         img_items["image_name"] = name
+         yield img_items
danbooru_cawler/main.py ADDED
@@ -0,0 +1,7 @@
+ from scrapy.cmdline import execute
+ import os
+ import sys
+
+ if __name__ == "__main__":
+     # sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+     execute(["scrapy", "crawl", "danbooru"])
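
main.py just hands off to Scrapy's command-line entry point, so the spider can equally be started with `scrapy crawl danbooru` from the directory containing scrapy.cfg. As a small sketch (not part of this commit), the same entry point also accepts the usual Scrapy CLI options, for example a one-off setting override:

```
from scrapy.cmdline import execute

# Equivalent to running `scrapy crawl danbooru -s DOWNLOAD_DELAY=1`:
# crawl with a 1-second download delay for this run only.
execute(["scrapy", "crawl", "danbooru", "-s", "DOWNLOAD_DELAY=1"])
```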
danbooru_cawler/requirements.txt ADDED
@@ -0,0 +1,2 @@
+ itemadapter==0.8.0
+ Scrapy==2.11.0
danbooru_cawler/scrapy.cfg ADDED
@@ -0,0 +1,11 @@
+ # Automatically created by: scrapy startproject
+ #
+ # For more information about the [deploy] section see:
+ # https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+ [settings]
+ default = danbooru_crawler.settings
+
+ [deploy]
+ #url = http://localhost:6800/
+ project = danbooru_crawler