fun

Add quick scraping script for VL elections

Author
Maarten Vangeneugden
Date
Nov. 18, 2024, 9:02 a.m.
Hash
894b5b0ae458e91490702824908117ba2ca22795
Parent
bd51d9f6f0580feae1b14ebe056cc1e678df9581
Modified file
pagescraper.py

pagescraper.py

65 additions and 0 deletions.

# https://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup
# What this file should do is get all the links that are present in an HTML
# file I'm giving it, and then tell me all the links it could find. Those links
# are then searched and scraped in their entirety, and copied to a directory of
# my choice.

import httplib2
from bs4 import BeautifulSoup, SoupStrainer
import requests
import time

import os
from os.path import isfile, join

url_prefix = "https://verkiezingsuitgaven.vlaanderen.be/"
root_path = "/tmp/verkiezingen/"

base_html_files = [f for f in os.listdir(root_path) if isfile(join(root_path, f))]

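# Every HTML file already sitting in root_path acts as a starting page: its
# links are collected below, and a subdirectory named after the file holds the
# party and candidate expense pages scraped from those links.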
for file_path in base_html_files:
    useful_links = []
    with open(root_path + file_path, 'r') as f:
        content = f.read()
        for link in BeautifulSoup(content, 'html.parser', parse_only=SoupStrainer('a')):
            if link.has_attr('href'):
                if link['href'].startswith(url_prefix) and link['href'] != url_prefix and "publieke-opvraging" not in link['href']:
                    useful_links.append(link['href'])
                    print(link['href'])
        #print(useful_links)
    # We have now collected all URLs that we're interested in. Next up is
    # creating the directory (if it doesn't already exist) for this
    # municipality/province/district, and adding the HTML files (if they don't
    # already exist).
    subdir_name = file_path[:-5]  # remove trailing ".html"
    if subdir_name not in os.listdir(root_path):  # Create directory if not existing
        os.mkdir(root_path + subdir_name)
        print("CREATED!")
    for useful_link in useful_links:
        # Sleep for 1 second so as not to trigger Cloudflare's rate limiting
        time.sleep(1)
        http = httplib2.Http()
        #_, response = http.request(useful_link)
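        # Note: the httplib2 handle above is never used (its request call is
        # commented out); the actual fetch below goes through requests.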
        # Sending a browser-like User-Agent header is necessary because the
        # site sits behind Cloudflare, which rejects requests without one.
        header = {"user-agent":"Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/119.0"}
        response = requests.get(useful_link, headers=header).text
        print(f"VISITED {useful_link}")
        # Decide if this is a party expense or a candidate expense:
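        # The slash count distinguishes the two: a single path segment after
        # the domain (3 slashes in total) is treated as a party page, while
        # two segments (4 slashes, party/candidate) mean a candidate page.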
        if useful_link.count('/') == 3:  # Party expense
            party_name = useful_link.rpartition('/')[2]
            if party_name not in os.listdir(root_path + subdir_name):  # Create directory if not existing
                os.mkdir(root_path + subdir_name +'/'+ party_name)
            with open(root_path + subdir_name +'/'+ party_name +".html", 'w') as g:
                g.write(response)

        elif useful_link.count('/') == 4:  # Candidate expense
            cand_name = useful_link.split('/')[-1]
            party_name = useful_link.split('/')[-2]
            if party_name not in os.listdir(root_path + subdir_name):  # Create directory if not existing
                os.mkdir(root_path + subdir_name +'/'+ party_name)
            with open(root_path + subdir_name +'/'+ party_name +'/'+ cand_name +".html", 'w') as g:
                g.write(response)
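For orientation, here is a minimal sketch (not part of the commit) of how the tree this script leaves under root_path could be listed afterwards; the layout it assumes follows directly from the code above: party pages are saved as <district>/<party>.html and candidate pages as <district>/<party>/<candidate>.html.

# Minimal sketch: walk the tree that pagescraper.py leaves behind and print
# every saved page. Assumes the same root_path as in the script above.
import os

root_path = "/tmp/verkiezingen/"

for dirpath, dirnames, filenames in os.walk(root_path):
    for filename in filenames:
        if filename.endswith(".html"):
            print(os.path.join(dirpath, filename))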