Commit 04242a9 by Francesco: first commit
data.py
ADDED
# def get_lyrics_url_from_website():
# # https://www.disneyclips.com/lyrics/

import aiohttp
import asyncio
from bs4 import BeautifulSoup

from typing import List, TypedDict, Tuple, Optional


class Lyric(TypedDict):
    name: str
    text: str


class Movie(TypedDict):
    title: str
    lyrics: List[Lyric]


URL = "https://www.disneyclips.com/lyrics/"


async def get_lyrics_urls_from_movie_url(url: str, session: aiohttp.ClientSession) -> Optional[List[Tuple[str, str]]]:
    """Return (song name, lyrics page URL) pairs for one movie page, or None if it has no songs table."""
    async with session.get(url) as response:
        html = await response.text()
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', {'class': 'songs'})
        names_and_urls = None
        if table:
            links = table.find_all('a')
            names_and_urls = []
            for link in links:
                # hrefs are relative; URL already ends with a slash
                names_and_urls.append((link.text, f"{URL}{link.get('href')}"))
        return names_and_urls


async def get_lyric_from_lyric_url(url: str, name: str, session: aiohttp.ClientSession) -> Lyric:
    """Fetch one lyrics page and return it as a Lyric dict."""
    async with session.get(url) as response:
        html = await response.text()
        soup = BeautifulSoup(html, 'html.parser')
        div = soup.find('div', {'id': 'cnt'}).find('div', {'class': 'main'})
        paragraphs = div.find_all('p')
        # the lyrics are split across several <p> tags; join them into one string
        text = ""
        for p in paragraphs:
            text += p.text
        return Lyric(name=name, text=text)


async def get_movie_names_and_urls(session: aiohttp.ClientSession) -> List[Tuple[str, str]]:
    """Return (movie title, movie page URL) pairs from the lyrics index page."""
    async with session.get(URL) as response:
        html = await response.text()
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find('div', {'id': 'cnt'}).find('div', {'class': 'main'}).find_all('a')
        movie_names_and_urls = [(link.text, f"{URL}{link.get('href')}") for link in links]
        return movie_names_and_urls


async def main():
    async with aiohttp.ClientSession() as session:
        names_and_urls = await get_movie_names_and_urls(session)
        # fetch every movie's song list concurrently; gather schedules the coroutines as tasks
        data = await asyncio.gather(
            *[get_lyrics_urls_from_movie_url(url, session) for (_name, url) in names_and_urls]
        )


if __name__ == "__main__":
    asyncio.run(main())
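
The Movie structure is declared above but never populated in this commit. Below is a minimal sketch, assuming the intent is to pair each movie title with its scraped lyrics, of how the three fetchers could be composed; the build_dataset helper and the JSON output path are illustrative assumptions, not part of data.py.

import json

# Sketch only: build_dataset is a hypothetical helper layered on top of data.py.
async def build_dataset() -> List[Movie]:
    async with aiohttp.ClientSession() as session:
        movies: List[Movie] = []
        for title, movie_url in await get_movie_names_and_urls(session):
            song_links = await get_lyrics_urls_from_movie_url(movie_url, session)
            if not song_links:
                continue  # movie page without a songs table
            # fetch all lyrics for this movie concurrently
            lyrics = await asyncio.gather(
                *[get_lyric_from_lyric_url(url, name, session) for (name, url) in song_links]
            )
            movies.append(Movie(title=title, lyrics=list(lyrics)))
        return movies

# Assumed usage: dump the scraped records to a JSON file.
# with open("lyrics.json", "w") as f:
#     json.dump(asyncio.run(build_dataset()), f)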