# I just copied the workflow from
# https://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup
# What this file should do is get all the links that are present in an HTML
# file I'm giving it, and then tell me all the links it could find. Those links
# are then fetched and scraped in their entirety, and copied to a directory of
# my choice.
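#
# Resulting directory layout (derived from the code below; "district" stands
# for municipality/province/district):
#   <root_path>/<district>.html                      input overview pages
#   <root_path>/<district>/<party>.html              scraped party expense pages
#   <root_path>/<district>/<party>/<candidate>.html  scraped candidate expense pages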

from bs4 import BeautifulSoup, SoupStrainer
import requests
import time

import os
from os.path import isfile, join

url_prefix = "https://verkiezingsuitgaven.vlaanderen.be/"
root_path = "/tmp/verkiezingen/"

base_html_files = [f for f in os.listdir(root_path) if isfile(join(root_path, f))]
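# Only regular files count as inputs here, so the subdirectories this script
# creates inside root_path won't be picked up as input on a later run.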


for file_path in base_html_files:
    useful_links = []
    with open(root_path + file_path, 'r') as f:
        content = f.read()
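    # SoupStrainer('a') limits parsing to the <a> elements, which is all we
    # need for link extraction.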
    for link in BeautifulSoup(content, 'html.parser', parse_only=SoupStrainer('a')):
        if link.has_attr('href'):
            if link['href'].startswith(url_prefix) and link['href'] != url_prefix and "publieke-opvraging" not in link['href']:
                useful_links.append(link['href'])
                print(link['href'])
    #print(useful_links)
    # We have now collected all URLs that we're interested in. Next up is
    # creating the directory (if it doesn't already exist) for this
    # municipality/province/district, and adding the HTML files (if they don't
    # already exist).
    subdir_name = file_path[:-5]  # remove trailing ".html"
    if subdir_name not in os.listdir(root_path):  # Create directory if not existing
        os.mkdir(root_path + subdir_name)
        print("CREATED!")
    for useful_link in useful_links:
        # Sleeping for 1 second so as not to trigger the Cloudflare bot protection
        time.sleep(1)
        # Using a bogus header is necessary because they decided to use
        # Cloudflare, which cakes its pants without one.
        header = {"user-agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0"}
        response = requests.get(useful_link, headers=header).text
        print(f"VISITED {useful_link}")
        # Decide if this is a party expense or a candidate expense:
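        # Because url_prefix ends with a slash, a party page URL is
        # <url_prefix><party> (3 slashes in total) and a candidate page URL is
        # <url_prefix><party>/<candidate> (4 slashes).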
        if useful_link.count('/') == 3:  # Party expense
            party_name = useful_link.rpartition('/')[2]
            if party_name not in os.listdir(root_path + subdir_name):  # Create directory if not existing
                os.mkdir(root_path + subdir_name + '/' + party_name)
            with open(root_path + subdir_name + '/' + party_name + ".html", 'w') as g:
                g.write(response)

        elif useful_link.count('/') == 4:  # Candidate expense
            cand_name = useful_link.split('/')[-1]
            party_name = useful_link.split('/')[-2]
            if party_name not in os.listdir(root_path + subdir_name):  # Create directory if not existing
                os.mkdir(root_path + subdir_name + '/' + party_name)
            with open(root_path + subdir_name + '/' + party_name + '/' + cand_name + ".html", 'w') as g:
                g.write(response)