pagescraper.py

# I just copied the workflow from
# https://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup
# This file takes the HTML files I give it, collects all the links it can
# find in them, and prints each one. Those links are then fetched and scraped
# in their entirety, and the resulting pages are saved to a directory of my
# choice.
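
# Assumed precondition: root_path (set below) must already contain one saved
# overview page per municipality/province/district, e.g. "antwerpen.html"
# (that filename is just an illustration); this script does not download
# those base pages itself.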

from bs4 import BeautifulSoup, SoupStrainer
import requests
import time

import os
from os.path import isfile, join

url_prefix = "https://verkiezingsuitgaven.vlaanderen.be/"
root_path = "/tmp/verkiezingen/"

base_html_files = [f for f in os.listdir(root_path) if isfile(join(root_path, f))]

for file_path in base_html_files:
    useful_links = []
    with open(root_path + file_path, 'r') as f:
        content = f.read()
        # Only parse <a> tags (via SoupStrainer) and keep on-site links,
        # skipping the site root itself and the "publieke-opvraging" pages.
        for link in BeautifulSoup(content, 'html.parser', parse_only=SoupStrainer('a')):
            if link.has_attr('href'):
                if link['href'].startswith(url_prefix) and link['href'] != url_prefix and "publieke-opvraging" not in link['href']:
                    useful_links.append(link['href'])
                    print(link['href'])
        #print(useful_links)
    # We have now collected all the URLs we're interested in. Next up is
    # creating the directory (if it doesn't already exist) for this
    # municipality/province/district, and adding the HTML files (if they
    # don't already exist).
    subdir_name = file_path[:-5]  # strip the trailing ".html"
    if subdir_name not in os.listdir(root_path):  # Create the directory if it doesn't exist yet
        os.mkdir(root_path + subdir_name)
        print("CREATED!")
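    # (A more robust idiom here would be os.makedirs(root_path + subdir_name,
    # exist_ok=True), which also avoids the race between the listdir() check
    # and the mkdir() call; the same applies to the per-party mkdir calls
    # further down.)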
    for useful_link in useful_links:
        # Sleep for 1 second so as not to trigger Cloudflare's rate limiting.
        time.sleep(1)
        # A browser-like User-Agent header is necessary because the site sits
        # behind Cloudflare, which blocks requests that don't send one.
        header = {"user-agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/119.0"}
        response = requests.get(useful_link, headers=header).text
        print(f"VISITED {useful_link}")
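        # A minimal hardening sketch, left inactive (the timeout value and
        # the single-retry policy are assumptions, not part of the original
        # flow): check the status code and back off once instead of assuming
        # every fetch succeeds.
        # resp = requests.get(useful_link, headers=header, timeout=30)
        # if resp.status_code != 200:
        #     time.sleep(10)  # crude backoff before one retry
        #     resp = requests.get(useful_link, headers=header, timeout=30)
        # response = resp.text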
        # Decide if this is a party expense or a candidate expense by counting
        # slashes: the URL prefix itself already contains three, so a party
        # page (one path segment) stays at three while a candidate page
        # (party/candidate, two segments) has four.
        if useful_link.count('/') == 3:  # Party expense
            party_name = useful_link.rpartition('/')[2]
            if party_name not in os.listdir(root_path + subdir_name):  # Create the directory if it doesn't exist yet
                os.mkdir(root_path + subdir_name + '/' + party_name)
            with open(root_path + subdir_name + '/' + party_name + ".html", 'w') as g:
                g.write(response)

        elif useful_link.count('/') == 4:  # Candidate expense
            cand_name = useful_link.split('/')[-1]
            party_name = useful_link.split('/')[-2]
            if party_name not in os.listdir(root_path + subdir_name):  # Create the directory if it doesn't exist yet
                os.mkdir(root_path + subdir_name + '/' + party_name)
            with open(root_path + subdir_name + '/' + party_name + '/' + cand_name + ".html", 'w') as g:
                g.write(response)
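
# A quick sanity check of the slash-counting heuristic used above; the path
# segments here are made-up examples, not real party or candidate names.
assert (url_prefix + "some-party").count('/') == 3
assert (url_prefix + "some-party/some-candidate").count('/') == 4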