Water quality from SINAC in Spain

May 28, 2022 / maestro / Uncategorized

SINAC is the database on the quality of water for human consumption in Spain. It is quite a nice initiative that releases its data once a year.
Of course, it can be consulted if you know what to search for.
However, if you want to ask a question such as "What is the oldest analysis run in any of the data sources?", there is no way to get an answer.
So, with a little trick, you can scrape all the data and download the resulting 113 MB. (download here)
As a first result, you can see here the number of analyzed parameters per last year of analysis.

Raw data per year (2022 till March)
2003 872
2004 2624
2005 3146
2006 5812
2007 6546
2008 9655
2009 9892
2010 11952
2011 19640
2012 21505
2013 24087
2014 19496
2015 43178
2016 30688
2017 30008
2018 52191
2019 81499
2020 89473
2021 497280
2022 140682

The code for scraping the pages

# Download every SINAC water-supply page (idRed = 0 .. limit - 1) into
# pages/page<N>.html so the pages can be processed offline later.
import os

import requests

# Number of consecutive idRed values to request.
limit = 23000
# Seconds to wait for the server before giving up on a single page; without
# a timeout one stalled connection would block the whole crawl forever.
timeout = 30

# Create the output directory up front so the first open() below cannot fail
# on a fresh run.
os.makedirs("pages", exist_ok=True)

for i in range(limit):
    # Countdown of the pages still to be downloaded.
    print(limit - i)
    url = (
        "https://sinacv2.sanidad.gob.es/CiudadanoWeb/ciudadano/"
        "informacionAbastecimientoActionCA.do?idRed=" + str(i)
    )
    try:
        page = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # One unreachable page should not abort a crawl of thousands of pages.
        print("page " + str(i) + " could not be downloaded: " + str(error))
        continue
    filePath = "pages/page" + str(i) + ".html"
    with open(filePath, "w") as file:
        # The body is stored as the repr of the bytes object (b'...'). The
        # page-processing script strips that prefix and replaces the escaped
        # \xNN sequences, so the on-disk format is deliberately unchanged.
        file.write(str(page.content))

and, later, the code for processing the pages and creating a large JSON file
# Parse the downloaded SINAC pages in ./pages and write everything that was
# extracted (main data of each water source plus its analysis tables) to a
# single JSON file.
import json
import os


def echo(variablename, variable):
    """Print a variable's name and value between two separator lines."""
    print("____________________________")
    print(variablename + " = " + str(variable))
    print("*****************************")


def cleanhtml(text):
    """Return *text* with known escaped accents and whitespace codes replaced.

    The pages were saved as the repr of a bytes object, so accented letters
    show up as literal backslash sequences (e.g. the six characters
    ``\\xc3\\xa1``) and tabs / carriage returns as literal ``\\t`` / ``\\r``.
    Accented vowels are replaced by their plain vowel; ``\\t`` and ``\\r``
    are removed.
    """
    cleandict = [
        {"chain": "\\xc3\\xad", "replacement": "i"},
        {"chain": "\\xc3\\xa1", "replacement": "a"},
        {"chain": "\\xc3\\xa9", "replacement": "e"},
        # C3 A1 is "a" with acute accent; this entry used to map it to "e",
        # contradicting the escaped form of the same sequence above.
        {"chain": "\xc3\xa1", "replacement": "a"},
        {"chain": "\xc3\xb3", "replacement": "o"},
        {"chain": "\\xc3\\xb3", "replacement": "o"},
        {"chain": "\\t", "replacement": ""},
        {"chain": "\\r", "replacement": ""},
    ]
    output = text
    for element in cleandict:
        # str.replace is a no-op when the chain is absent, so no pre-check.
        output = output.replace(element["chain"], element["replacement"])
    return output


# main repository of information
sinac = {"data": {}, "date": "27/03/2022"}
# where the retrieved pages are stored for later processing
mypath = "./pages"
# ids of the tables where the analysis information is contained
tableCodes = ["rowIndic", "rowMicro", "rowPlag", "rowQuim"]
outputFileSinac = "sinac.json"


def _read_main_data(soup):
    """Return the header -> value dict of the "bloqueTabla" block, or None.

    None is returned when the page has no such block.
    """
    mainDataHtml = soup.find("div", attrs={"class": "bloqueTabla"})
    if mainDataHtml is None:
        return None
    # headers of the main data about the water source
    sourceKeys = [cleanhtml(str(item.text)) for item in mainDataHtml.find_all("th")]
    print(sourceKeys[:-1])
    # values of the main data about the water source
    sourceValues = [cleanhtml(str(item.text)) for item in mainDataHtml.find_all("td")]
    print(sourceValues[:-1])
    return dict(zip(sourceKeys, sourceValues))


def _read_analysis_table(soup, table, fileInDir):
    """Return a new {"type": table, "data": ...} dict for one analysis table.

    "data" is the string "Not available" when the page has no table with
    that id, otherwise a list with one header -> cell dict per table row.
    A fresh dict is built on every call: reusing and mutating one dict
    across tables would overwrite results that were already stored.
    """
    print("table = " + table)
    tableHtml = soup.find("table", attrs={"id": table})
    echo("tableHtml", tableHtml)
    # some pages do not have every table
    if tableHtml is None:
        print("table " + table + " is not found at file " + fileInDir)
        return {"type": table, "data": "Not available"}

    analysis = {"type": table, "data": []}
    tableHtmlData = tableHtml.find_all("tr")
    echo("tableHtmlData", tableHtmlData)
    if not tableHtmlData:
        # a table without rows has neither headers nor values
        return analysis

    # the first row carries the column headers
    heading = [cleanhtml(str(th.text)) for th in tableHtmlData[0].find_all("th")]
    echo("heading", heading)
    for row in tableHtmlData[1:]:
        values = [cleanhtml(str(td.text)) for td in row.find_all("td")]
        echo("values", values)
        # zip stops at the shorter sequence, so a row with more cells than
        # headers no longer raises IndexError
        analysis["data"].append(dict(zip(heading, values)))
    return analysis


def main():
    """Parse every file in ``mypath`` and dump the result to ``outputFileSinac``."""
    # bs4 is only needed for the actual parsing; importing it here keeps the
    # helper functions above importable without it.
    from bs4 import BeautifulSoup

    # files with the source code of the pages
    onlyfiles = [f for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]

    for fileInDir in onlyfiles:
        echo("fileInDir", fileInDir)
        index = fileInDir.replace("page", "").replace(".html", "")
        # the saved file is the repr of a bytes object: drop the leading b'
        with open(os.path.join(mypath, fileInDir), "r") as file:
            content = file.read()[2:]
        soup = BeautifulSoup(content, 'html.parser')

        # pages without main data keep an empty entry
        sinac["data"][index] = {}
        sourceDict = _read_main_data(soup)
        if sourceDict is None:
            continue
        sinac["data"][index] = sourceDict
        print(sourceDict)

        # analysis tables of the water source
        sinac["data"][index]["analisis"] = []
        for table in tableCodes:
            analysis = _read_analysis_table(soup, table, fileInDir)
            sinac["data"][index]["analisis"].append(analysis)
            echo("analysis", analysis)

    with open(outputFileSinac, "w") as sinacFile:
        json.dump(sinac, sinacFile)


if __name__ == "__main__":
    main()

Comments are currently closed.

OPEN

Last posts

Water quality from SINAC in Spain