Add quick scraping script for VL elections
- Author: Maarten Vangeneugden
- Date: Nov. 18, 2024, 10:02 a.m.
- Hash: 894b5b0ae458e91490702824908117ba2ca22795
- Parent: bd51d9f6f0580feae1b14ebe056cc1e678df9581
- Modified file: pagescraper.py
pagescraper.py
65 additions and 0 deletions.
# https://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup
# What this file should do is get all the links that are present in an HTML
# file I'm giving it, and then tell me all the links it could find. Those links
# are then fetched and scraped in their entirety, and copied to a directory of
# my choice.

import httplib2
from bs4 import BeautifulSoup, SoupStrainer
import requests
import time

import os
from os.path import isfile, join

url_prefix = "https://verkiezingsuitgaven.vlaanderen.be/"
root_path = "/tmp/verkiezingen/"

base_html_files = [f for f in os.listdir(root_path) if isfile(join(root_path, f))]


for file_path in base_html_files:
    useful_links = []
    with open(root_path + file_path, 'r') as f:
        content = f.read()
        for link in BeautifulSoup(content, 'html.parser', parse_only=SoupStrainer('a')):
            if link.has_attr('href'):
                if link['href'].startswith(url_prefix) and link['href'] != url_prefix and "publieke-opvraging" not in link['href']:
                    useful_links.append(link['href'])
                    print(link['href'])
    #print(useful_links)
    # We have now collected all URLs that we're interested in. Next up is
    # creating the directory (if it doesn't already exist) for this
    # municipality/province/district, and adding the HTML files (if they don't
    # already exist).
    subdir_name = file_path[:-5]  # remove trailing ".html"
    if subdir_name not in os.listdir(root_path):  # Create directory if it doesn't exist yet
        os.mkdir(root_path + subdir_name)
        print("CREATED!")
    for useful_link in useful_links:
        # Sleep for 1 second so as not to trigger Cloudflare's rate limiting.
        time.sleep(1)
        http = httplib2.Http()
        #_, response = http.request(useful_link)
        # Sending a browser-like User-Agent is necessary because the site sits
        # behind Cloudflare, which refuses requests without one.
        header = {"user-agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/119.0"}
        response = requests.get(useful_link, headers=header).text
        print(f"VISITED {useful_link}")
        # Decide if this is a party expense or a candidate expense:
        if useful_link.count('/') == 3:  # Party expense
            party_name = useful_link.rpartition('/')[2]
            if party_name not in os.listdir(root_path + subdir_name):  # Create directory if it doesn't exist yet
                os.mkdir(root_path + subdir_name + '/' + party_name)
            with open(root_path + subdir_name + '/' + party_name + ".html", 'w') as g:
                g.write(response)

        elif useful_link.count('/') == 4:  # Candidate expense
            cand_name = useful_link.split('/')[-1]
            party_name = useful_link.split('/')[-2]
            if party_name not in os.listdir(root_path + subdir_name):  # Create directory if it doesn't exist yet
                os.mkdir(root_path + subdir_name + '/' + party_name)
            with open(root_path + subdir_name + '/' + party_name + '/' + cand_name + ".html", 'w') as g:
                g.write(response)
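
For reference, here is a minimal sketch (not part of the commit) of how the slash-count check sorts a link into the party or candidate layout the script writes under root_path. The classify() helper and the example URLs are hypothetical; they only illustrate where each page ends up.

url_prefix = "https://verkiezingsuitgaven.vlaanderen.be/"

def classify(link, subdir_name, root_path="/tmp/verkiezingen/"):
    # Hypothetical helper: returns the path the scraper would write for a link.
    if link.count('/') == 3:    # one path segment after the domain -> party expense
        party_name = link.rpartition('/')[2]
        return root_path + subdir_name + '/' + party_name + ".html"
    elif link.count('/') == 4:  # two path segments -> candidate expense
        party_name, cand_name = link.split('/')[-2:]
        return root_path + subdir_name + '/' + party_name + '/' + cand_name + ".html"
    return None                 # anything else is ignored by the scraper

# Hypothetical example links, only to show the resulting layout:
print(classify(url_prefix + "partij-x", "gemeente-y"))
# /tmp/verkiezingen/gemeente-y/partij-x.html
print(classify(url_prefix + "partij-x/kandidaat-z", "gemeente-y"))
# /tmp/verkiezingen/gemeente-y/partij-x/kandidaat-z.html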
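
And a small sketch, also not part of the commit, for checking the result of a run: it simply walks root_path and lists every saved page, assuming the layout shown above.

import os

root_path = "/tmp/verkiezingen/"  # same directory the scraper writes into
for dirpath, dirnames, filenames in os.walk(root_path):
    for filename in filenames:
        if filename.endswith(".html"):
            print(os.path.join(dirpath, filename))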