Last posts

Water quality from SINAC in Spain

May 28, 2022 / maestro / Uncategorized

SINAC is the database on the quality of water for human consumption in Spain. It is quite a nice initiative that releases its data once a year.
Of course, it can be consulted if you know what to search for.
However, if you want to ask a question such as "What is the oldest analysis run in any of the data sources?", then there is no answer.
So, with a little trick, you can scrape all the data and download the full 113 MB. (download here)
As a first result, you can see here the number of analyzed parameters per last year of analysis.

Raw data per year (2022 till March)
2003 872
2004 2624
2005 3146
2006 5812
2007 6546
2008 9655
2009 9892
2010 11952
2011 19640
2012 21505
2013 24087
2014 19496
2015 43178
2016 30688
2017 30008
2018 52191
2019 81499
2020 89473
2021 497280
2022 140682

The code for scraping the pages:


import os

import requests

# Network ids are probed sequentially from 0 up to (but excluding) this limit.
limit = 23000

# The pages are written under "pages/"; create it up front so the first
# open() below does not fail on a fresh checkout.
os.makedirs("pages", exist_ok=True)

for i in range(limit):
    # Countdown of the remaining requests, used as a crude progress indicator.
    print(limit - i)
    url = "https://sinacv2.sanidad.gob.es/CiudadanoWeb/ciudadano/informacionAbastecimientoActionCA.do?idRed=" + str(i)
    try:
        # requests has no default timeout: without one, a single stalled
        # connection would block the whole run indefinitely.
        page = requests.get(url, timeout=30)
    except requests.RequestException as error:
        # One unreachable page must not abort a run of thousands of requests.
        print("request failed for idRed=" + str(i) + ": " + str(error))
        continue
    filePath = "pages/page" + str(i) + ".html"
    with open(filePath, "w") as file:
        # NOTE: this stores the repr of the bytes payload (b'...'), not the
        # decoded HTML. The processing script strips the leading "b'" and
        # replaces the escaped byte sequences, so the format is kept as is.
        file.write(str(page.content))

and later, the code for processing the pages and creating a large JSON file:

from bs4 import BeautifulSoup
import os
import json
def echo(variablename, variable):
    """Print a labelled debug dump of a value, framed by two separator lines.

    variablename -- label shown before the value (must be a string)
    variable     -- any object; it is rendered with str()
    """
    print("____________________________")
    labelled = " = ".join((variablename, str(variable)))
    print(labelled)
    print("*****************************")
# Ordered (sequence, replacement) pairs applied by cleanhtml().
# Each accented vowel appears twice: once as the literal backslash escape
# found in text produced by str(bytes) (e.g. the six characters \xc3\xa1),
# and once as the two raw characters "\xc3\xa1". Only a, e, i and o with an
# acute accent are covered.
_CLEAN_REPLACEMENTS = (
    ("\\xc3\\xad", "i"),
    ("\\xc3\\xa1", "a"),
    ("\\xc3\\xa9", "e"),
    ("\\xc3\\xb3", "o"),
    ("\xc3\xad", "i"),
    ("\xc3\xa1", "a"),
    ("\xc3\xa9", "e"),
    ("\xc3\xb3", "o"),
    # literal backslash-t / backslash-r escapes are simply removed
    ("\\t", ""),
    ("\\r", ""),
)


def cleanhtml(text):
    """Return text with accent byte sequences and tab/CR escapes cleaned up.

    The UTF-8 byte pairs for the accented vowels a, e, i and o (in escaped
    or raw form) are replaced by the plain ASCII vowel, and the literal
    two-character sequences backslash-t and backslash-r are dropped.

    text -- the string to clean
    Returns a new string; the input is not modified.
    """
    output = text
    for chain, replacement in _CLEAN_REPLACEMENTS:
        # str.replace is a no-op when the sequence is absent, so no
        # membership test is needed beforehand.
        output = output.replace(chain, replacement)
    return output
# Main repository of information: one entry per page index under "data",
# plus a fixed date string.
sinac = {"data": {}, "date": "27/03/2022"}
# where the retrieved pages are stored for later processing
mypath = "./pages"
# ids of the HTML tables where the analysis information is contained
tableCodes = ["rowIndic", "rowMicro", "rowPlag", "rowQuim"]
# files with the source code of the pages
onlyfiles = [f for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]
outputFileSinac = "sinac.json"

for fileInDir in onlyfiles:
    echo("fileInDir", fileInDir)
    # "page123.html" -> "123"
    index = fileInDir.replace("page", "").replace(".html", "")

    # Read the stored page. The scraper wrote str(bytes), so the text starts
    # with the two characters b' which are dropped here. The file is closed
    # as soon as it has been read.
    with open(mypath + "/" + fileInDir, "r") as file:
        content = file.read()[2:]
    print(content)
    soup = BeautifulSoup(content, 'html.parser')
    echo("soup", soup)

    # Every page gets an entry; it stays empty when the page has no data block.
    sinac["data"][index] = {}
    mainDataHtml = soup.find("div", attrs={"class": "bloqueTabla"})
    if mainDataHtml is None:
        continue

    # headers of the main data about the water source
    sourceKeys = [cleanhtml(str(item.text)) for item in mainDataHtml.find_all("th")]
    print(sourceKeys[:-1])
    # values of the main data about the water source
    sourceValues = [cleanhtml(str(item.text)) for item in mainDataHtml.find_all("td")]
    print(sourceValues[:-1])
    # pair headers with values and attach the result to the output
    sourceDict = dict(zip(sourceKeys, sourceValues))
    sinac["data"][index] = sourceDict
    print(sourceDict)

    # looking for the analysis tables of the water source in the page
    sinac["data"][index]["analisis"] = []
    for table in tableCodes:
        print("table = " + table)
        tableHtml = soup.find("table", attrs={"id": table})
        echo("tableHtml", tableHtml)
        # A fresh dict is built for every table. Reusing one dict across
        # iterations would overwrite an entry that is already in the output
        # list whenever a later table is missing.
        if tableHtml is None:
            # some pages do not have this table
            print("table " + table + " is not found at file " + fileInDir)
            analysis = {"type": table, "data": "Not available"}
        else:
            analysis = {"type": table, "data": []}
            tableHtmlData = tableHtml.find_all("tr")
            echo("tableHtmlData", tableHtmlData)
            # The first row carries the column headers; a table without any
            # row simply yields no headers and no data.
            heading = []
            headerCells = tableHtmlData[0].find_all("th") if tableHtmlData else []
            for td in headerCells:
                echo("td", td)
                heading.append(cleanhtml(str(td.text)))
            echo("heading", heading)
            for row in tableHtmlData[1:]:
                echo("row", row)
                rowValues = row.find_all("td")
                echo("rowValues", rowValues)
                values = [cleanhtml(str(value.text)) for value in rowValues]
                echo("values", values)
                rowData = {}
                # zip() pairs cells with headers the same way the main data
                # is paired above; cells beyond the last header are ignored
                # instead of raising IndexError and aborting the whole run.
                for counter, (columnName, element) in enumerate(zip(heading, values)):
                    echo("element", element)
                    echo("counter", counter)
                    echo("heading[counter]", columnName)
                    rowData[columnName] = element
                analysis["data"].append(rowData)
        sinac["data"][index]["analisis"].append(analysis)
        echo("analysis", analysis)

# write everything collected into a single JSON file
with open(outputFileSinac, "w") as sinacFile:
    json.dump(sinac, sinacFile)

Comments are currently closed.