pagescraper.py

# I just copied the workflow from
# https://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup
# This file takes the HTML files I give it, collects all the links it can
# find in them, and prints each one. Those links are then fetched and scraped
# in their entirety, and the resulting pages are saved to a directory of my
# choice.
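
# Assumed precondition: root_path (set below) must already contain one saved
# overview page per municipality/province/district, e.g. "antwerpen.html"
# (that filename is just an illustration); this script does not download
# those base pages itself.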

from bs4 import BeautifulSoup, SoupStrainer
import requests
import time

import os
from os.path import isfile, join

url_prefix = "https://verkiezingsuitgaven.vlaanderen.be/"
root_path = "/tmp/verkiezingen/"

base_html_files = [f for f in os.listdir(root_path) if isfile(join(root_path, f))]

for file_path in base_html_files:
    useful_links = []
    with open(root_path + file_path, 'r') as f:
        content = f.read()
        # Only parse <a> tags (via SoupStrainer) and keep on-site links,
        # skipping the site root itself and the "publieke-opvraging" pages.
        for link in BeautifulSoup(content, 'html.parser', parse_only=SoupStrainer('a')):
            if link.has_attr('href'):
                if link['href'].startswith(url_prefix) and link['href'] != url_prefix and "publieke-opvraging" not in link['href']:
                    useful_links.append(link['href'])
                    print(link['href'])
        #print(useful_links)
    # We have now collected all the URLs we're interested in. Next up is
    # creating the directory (if it doesn't already exist) for this
    # municipality/province/district, and adding the HTML files (if they
    # don't already exist).
    subdir_name = file_path[:-5]  # strip the trailing ".html"
    if subdir_name not in os.listdir(root_path):  # Create the directory if it doesn't exist yet
        os.mkdir(root_path + subdir_name)
        print("CREATED!")
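    # (A more robust idiom here would be os.makedirs(root_path + subdir_name,
    # exist_ok=True), which also avoids the race between the listdir() check
    # and the mkdir() call; the same applies to the per-party mkdir calls
    # further down.)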
    for useful_link in useful_links:
        # Sleep for 1 second so as not to trigger Cloudflare's rate limiting.
        time.sleep(1)
        # A browser-like User-Agent header is necessary because the site sits
        # behind Cloudflare, which blocks requests that don't send one.
        header = {"user-agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/119.0"}
        response = requests.get(useful_link, headers=header).text
        print(f"VISITED {useful_link}")
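        # A minimal hardening sketch, left inactive (the timeout value and
        # the single-retry policy are assumptions, not part of the original
        # flow): check the status code and back off once instead of assuming
        # every fetch succeeds.
        # resp = requests.get(useful_link, headers=header, timeout=30)
        # if resp.status_code != 200:
        #     time.sleep(10)  # crude backoff before one retry
        #     resp = requests.get(useful_link, headers=header, timeout=30)
        # response = resp.text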
        # Decide if this is a party expense or a candidate expense by counting
        # slashes: the URL prefix itself already contains three, so a party
        # page (one path segment) stays at three while a candidate page
        # (party/candidate, two segments) has four.
        if useful_link.count('/') == 3:  # Party expense
            party_name = useful_link.rpartition('/')[2]
            if party_name not in os.listdir(root_path + subdir_name):  # Create the directory if it doesn't exist yet
                os.mkdir(root_path + subdir_name + '/' + party_name)
            with open(root_path + subdir_name + '/' + party_name + ".html", 'w') as g:
                g.write(response)

        elif useful_link.count('/') == 4:  # Candidate expense
            cand_name = useful_link.split('/')[-1]
            party_name = useful_link.split('/')[-2]
            if party_name not in os.listdir(root_path + subdir_name):  # Create the directory if it doesn't exist yet
                os.mkdir(root_path + subdir_name + '/' + party_name)
            with open(root_path + subdir_name + '/' + party_name + '/' + cand_name + ".html", 'w') as g:
                g.write(response)
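
# A quick sanity check of the slash-counting heuristic used above; the path
# segments here are made-up examples, not real party or candidate names.
assert (url_prefix + "some-party").count('/') == 3
assert (url_prefix + "some-party/some-candidate").count('/') == 4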