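"""Scrapers for Indonesian news sites (CNN Indonesia, Kompas, Detik).

Each scraper fetches an article URL, extracts the body text with
BeautifulSoup, and wraps the result in a LangChain Document so it can be
fed into downstream LangChain pipelines.
"""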
import requests
from bs4 import BeautifulSoup
# Note: on newer LangChain releases this loader lives in langchain_community.document_loaders
from langchain.document_loaders import UnstructuredURLLoader
from langchain_core.documents.base import Document
from urllib.parse import urlparse

# url = input("Insert Link That You Want to Scrape:")

def scrape_cnn(url):
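    """Fetch a CNN Indonesia article and return its body text as a single string."""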
    response = requests.get(url)
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        result = soup.find_all(class_="detail-wrap flex gap-4 relative")

        # Clean up and concatenate the text using a for loop
        cleaned_text_list = []
        for element in result:
            cleaned_text = element.get_text().replace('\n', '').strip()
            cleaned_text_list.append(cleaned_text)

        # Join the cleaned text from the list
        all_text = " ".join(cleaned_text_list)

        # # Print or use the cleaned and concatenated text
        # print(all_text)

        # # Write the result to a text file
        # with open("result.txt", "w", encoding="utf-8") as f:
        #     f.write(all_text)

        return all_text
    else:
        print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")
        return ""  # return an empty string so callers always receive a str

def scrape_kompas(url):
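    """Fetch a Kompas article and return the text embedded in its `keywordBrandSafety` script variable."""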
    response = requests.get(url)
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        # The article text is embedded in an inline script as `var keywordBrandSafety = "..."`
        result = ""
        for script in soup.find_all('script'):
            script_text = script.get_text()
            if "var keywordBrandSafety" in script_text:
                result = script_text
                break
        result = result.replace("var keywordBrandSafety =", "").strip().strip('";')
        return result
    else:
        print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")
        return ""  # return an empty string so callers always receive a str
  
def scrape_detik(url):
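    """Fetch a Detik article and return its body text as a single string."""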
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        results = soup.find_all(class_='detail__body-text itp_bodycontent')
        # Extract and return the text from each element
        cleaned_text_list = []
        for element in results:
            text = element.get_text().replace('\n', '').strip()
            cleaned_text_list.append(text)
        
        # Join the cleaned text from the list
        all_text = " ".join(cleaned_text_list)

        return all_text
    else:
        print(f"Failed to retrieve the webpage. Status Code: {response.status_code}")
        return ""  # return an empty string so callers always receive a str

def document_instance(link, content):
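    """Wrap scraped article text in a LangChain Document, keeping the source URL in metadata."""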
    return Document(
        metadata={'source': link},
        page_content=content or "",  # guard against a scraper returning None
    )

def scrape_cnn_instance(url):
    content = scrape_cnn(url)
    return document_instance(url, content)

def scrape_kompas_instance(url):
    content = scrape_kompas(url)
    return document_instance(url, content)

def scrape_detik_instance(url):
    content = scrape_detik(url)
    return document_instance(url, content)

def scraping_pipeline(links: list):
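    """Route each link to the matching site-specific scraper and collect the resulting Documents."""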
    result = []
    for link in links:
        parsed_url = urlparse(link)
        domain = parsed_url.netloc

        # filter for detik links
        if "detik.com" in domain:
            result.append(scrape_detik_instance(link))

        # filter for cnn
        elif "cnnindonesia.com" in domain:
            result.append(scrape_cnn_instance(link))
        
        # filter for kompas
        elif "kompas.com" in domain:
            result.append(scrape_kompas_instance(link))
        
        else:
            print(f"Skipping link: unsupported domain {domain}")
    return result

def langchain_url(url):
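    """Generic loader: fetch a URL with LangChain's UnstructuredURLLoader and return its Documents."""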
    loader = UnstructuredURLLoader([url])
    data = loader.load()
    return data


links = [
    'https://www.cnnindonesia.com/ekonomi/20231221152333-78-1040259/rupiah-merosot-ke-rp15525-jelang-rilis-data-inflasi-as',
    'https://www.cnnindonesia.com/olahraga/20231221131224-142-1040147/mohamed-salah-vs-arsenal-tajam-dan-lebih-sering-menang',
    'https://finance.detik.com/infrastruktur/d-7101502/ini-bocoran-konglomerat-yang-bakal-susul-aguan-cs-investasi-di-ikn'
]

if __name__ == "__main__":
    print(scraping_pipeline(links=links))