feat: Add script to download content from Wikidocs
- download_wikidocs.py  +74 -0
- requirements.txt      +4 -1
download_wikidocs.py
ADDED
@@ -0,0 +1,74 @@
import time
import requests
from bs4 import BeautifulSoup
import re
from markdownify import markdownify as md
import pandas as pd
import argparse


def extract_content(url: str):
    # Fetch the page and parse it
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # Convert the page subject and body to Markdown
    page_subject = soup.select_one("#load_content .page-subject")
    page_content = soup.select_one("#load_content .page-content")
    markdown_content = md(
        str(page_subject) + str(page_content),
        heading_style="ATX",
        bullets="-",
        strong_em_symbol="*",
        code_language="python",
        escape_asterisks=False,
        escape_underscores=False,
    )
    # Collapse double newlines into single newlines
    normalized_text = re.sub(r"\n{2}", "\n", markdown_content)

    return normalized_text


def main(ebook_url):
    base_url = "https://wikidocs.net"

    # Extract the book_id from the URL
    book_id = ebook_url.split("/")[-1]

    # Fetch the page source
    response = requests.get(ebook_url)
    response.raise_for_status()  # Raise on HTTP errors
    soup = BeautifulSoup(response.content, "html.parser")

    # Grab only the 'a' tags from the table of contents
    toc = soup.select(".list-group-toc a[href^='javascript:page(']")

    # List to hold the extracted entries
    data_list = []
    for item in toc:
        title = item.get_text(strip=True)
        page_id = item.get("href").split("page(")[-1].rstrip(")")
        link = f"{base_url}/{page_id}"
        data_list.append({"title": title, "link": link})

    # Walk the data list and extract each page's content
    for item in data_list[1:]:
        item["content"] = extract_content(item["link"])
        time.sleep(1)  # Wait between requests so pages can load

    # Convert to a DataFrame
    df = pd.DataFrame(data_list)
    df = df.dropna(subset=["content"])

    # Save the DataFrame as a parquet file
    parquet_filename = f"wikidocs_{book_id}.parquet"
    df.to_parquet(parquet_filename, index=False)

    print(f"File saved successfully: {parquet_filename}")


if __name__ == "__main__":
    # Parse the command-line argument
    parser = argparse.ArgumentParser(description="Enter the Wikidocs ebook URL.")
    parser.add_argument("ebook_url", type=str, help="Wikidocs ebook URL")
    args = parser.parse_args()

    main(args.ebook_url)
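Usage sketch: the script takes the ebook's table-of-contents URL as its single positional argument (the book id in the URL below is a placeholder, not a real book), and writes wikidocs_<book_id>.parquet to the working directory:

    python download_wikidocs.py https://wikidocs.net/book/<book_id>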
requirements.txt
CHANGED
@@ -12,4 +12,7 @@ langchain-openai
 langchain-core
 langchain-groq
 langchain_cohere
-chromadb
+chromadb
+markdownify
+pandas
+beautifulsoup4