Add quick scraping script for VL elections
- Author: Maarten Vangeneugden
- Date: Nov. 18, 2024, 10:02 a.m.
- Hash: 894b5b0ae458e91490702824908117ba2ca22795
- Parent: bd51d9f6f0580feae1b14ebe056cc1e678df9581
- Modified file: pagescraper.py
pagescraper.py
65 additions and 0 deletions.
# https://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup
# What this file should do is get all the links that are present in an HTML
# file I'm giving it, and then tell me all the links it could find. Those links
# are then fetched and scraped in their entirety, and copied to a directory of
# my choice.

import httplib2
from bs4 import BeautifulSoup, SoupStrainer
import requests
import time

import os
from os.path import isfile, join

url_prefix = "https://verkiezingsuitgaven.vlaanderen.be/"
root_path = "/tmp/verkiezingen/"

base_html_files = [f for f in os.listdir(root_path) if isfile(join(root_path, f))]


for file_path in base_html_files:
    useful_links = []
    with open(root_path + file_path, 'r') as f:
        content = f.read()
        for link in BeautifulSoup(content, 'html.parser', parse_only=SoupStrainer('a')):
            if link.has_attr('href'):
                if link['href'].startswith(url_prefix) and link['href'] != url_prefix and "publieke-opvraging" not in link['href']:
                    useful_links.append(link['href'])
                    print(link['href'])
    #print(useful_links)
    # We have now collected all URLs that we're interested in. Next up is
    # creating the directory (if it doesn't already exist) for this
    # municipality/province/district, and adding the HTML files (if they don't
    # already exist).
    subdir_name = file_path[:-5]  # remove trailing ".html"
    if subdir_name not in os.listdir(root_path):  # Create directory if it doesn't exist yet
        os.mkdir(root_path + subdir_name)
        print("CREATED!")
    for useful_link in useful_links:
        # Sleep for 1 second so as not to trigger Cloudflare's rate limiting.
        time.sleep(1)
        http = httplib2.Http()
        #_, response = http.request(useful_link)
        # Sending a browser-like User-Agent is necessary because the site sits
        # behind Cloudflare, which refuses requests without one.
        header = {"user-agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/119.0"}
        response = requests.get(useful_link, headers=header).text
        print(f"VISITED {useful_link}")
        # Decide if this is a party expense or a candidate expense:
        if useful_link.count('/') == 3:  # Party expense
            party_name = useful_link.rpartition('/')[2]
            if party_name not in os.listdir(root_path + subdir_name):  # Create directory if it doesn't exist yet
                os.mkdir(root_path + subdir_name + '/' + party_name)
            with open(root_path + subdir_name + '/' + party_name + ".html", 'w') as g:
                g.write(response)

        elif useful_link.count('/') == 4:  # Candidate expense
            cand_name = useful_link.split('/')[-1]
            party_name = useful_link.split('/')[-2]
            if party_name not in os.listdir(root_path + subdir_name):  # Create directory if it doesn't exist yet
                os.mkdir(root_path + subdir_name + '/' + party_name)
            with open(root_path + subdir_name + '/' + party_name + '/' + cand_name + ".html", 'w') as g:
                g.write(response)
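
For reference, here is a minimal sketch (not part of the commit) of how the slash-count check sorts a link into the party or candidate layout the script writes under root_path. The classify() helper and the example URLs are hypothetical; they only illustrate where each page ends up.

url_prefix = "https://verkiezingsuitgaven.vlaanderen.be/"

def classify(link, subdir_name, root_path="/tmp/verkiezingen/"):
    # Hypothetical helper: returns the path the scraper would write for a link.
    if link.count('/') == 3:    # one path segment after the domain -> party expense
        party_name = link.rpartition('/')[2]
        return root_path + subdir_name + '/' + party_name + ".html"
    elif link.count('/') == 4:  # two path segments -> candidate expense
        party_name, cand_name = link.split('/')[-2:]
        return root_path + subdir_name + '/' + party_name + '/' + cand_name + ".html"
    return None                 # anything else is ignored by the scraper

# Hypothetical example links, only to show the resulting layout:
print(classify(url_prefix + "partij-x", "gemeente-y"))
# /tmp/verkiezingen/gemeente-y/partij-x.html
print(classify(url_prefix + "partij-x/kandidaat-z", "gemeente-y"))
# /tmp/verkiezingen/gemeente-y/partij-x/kandidaat-z.html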
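
And a small sketch, also not part of the commit, for checking the result of a run: it simply walks root_path and lists every saved page, assuming the layout shown above.

import os

root_path = "/tmp/verkiezingen/"  # same directory the scraper writes into
for dirpath, dirnames, filenames in os.walk(root_path):
    for filename in filenames:
        if filename.endswith(".html"):
            print(os.path.join(dirpath, filename))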