From 7ea21d70bab2e52b212cff7a0ce6ab7714859857 Mon Sep 17 00:00:00 2001
From: David Megginson
Date: Wed, 7 Sep 2022 10:41:35 -0400
Subject: [PATCH] Add analysis for CKAN tags and report generation in Makefile.

---
 Makefile         | 99 ++++++++++++++++++++++++++++++++++++++++++++----
 ckan-tags.py     | 45 ++++++++++++++++++++++
 requirements.txt |  1 +
 3 files changed, 136 insertions(+), 9 deletions(-)
 create mode 100644 ckan-tags.py

diff --git a/Makefile b/Makefile
index 9bad52b..aaf37fb 100644
--- a/Makefile
+++ b/Makefile
@@ -1,21 +1,102 @@
 VENV=venv/bin/activate
 OUTPUT_DIR=output
-DATA=$(OUTPUT_DIR)/hdx-hashtag-stats.csv
-SCRIPT=crawl-hdx.py
+HASHTAG_SCRIPT=crawl-hdx.py
+HASHTAG_DATA=$(OUTPUT_DIR)/hdx-hashtag-stats.csv
+CKAN_TAG_SCRIPT=ckan-tags.py
+CKAN_TAG_DATA=$(OUTPUT_DIR)/hxl-ckan-tags.csv
 
-run: $(DATA)
+DATA_SERIES_DATA=$(OUTPUT_DIR)/data-series.csv
 
-$(DATA): $(VENV) $(SCRIPT)
-	. $(VENV) && mkdir -p $(OUTPUT_DIR) && python3 $(SCRIPT) > $@
+REPORTS=$(OUTPUT_DIR)/report-hashtags-by-data-series.csv \
+        $(OUTPUT_DIR)/report-tagspecs-by-data-series.csv \
+        $(OUTPUT_DIR)/report-hashtags-by-org.csv \
+        $(OUTPUT_DIR)/report-tagspecs-by-org.csv
+
+# Delete a target whose recipe fails, so a partially written CSV is never
+# mistaken for an up-to-date file.
+.DELETE_ON_ERROR:
+
+# Command-style targets that do not name files.
+.PHONY: all hashtag-data ckan-tags data-series reports sync clean-all clean-reports clean-venv
+
+all: hashtag-data ckan-tags
+
+hashtag-data: $(HASHTAG_DATA)
+
+ckan-tags: $(CKAN_TAG_DATA)
+
+data-series: $(DATA_SERIES_DATA)
+
+reports: $(REPORTS)
+
+# Hashtag statistics: the standard output of $(HASHTAG_SCRIPT).
+$(HASHTAG_DATA): $(VENV) $(HASHTAG_SCRIPT) | $(OUTPUT_DIR)
+	. $(VENV) && python3 $(HASHTAG_SCRIPT) > $@
+
+# CKAN tags from $(CKAN_TAG_SCRIPT), with meta+hash merged in from the data series (key: meta+dataset).
+$(CKAN_TAG_DATA): $(VENV) $(CKAN_TAG_SCRIPT) $(DATA_SERIES_DATA) | $(OUTPUT_DIR)
+	. $(VENV) && python3 $(CKAN_TAG_SCRIPT) | hxlmerge -k meta+dataset -t meta+hash -m $(DATA_SERIES_DATA) > $@
+
+# meta+dataset / meta+hash pairs cut from the hashtag data, de-duplicated on meta+dataset.
+$(DATA_SERIES_DATA): $(VENV) $(HASHTAG_DATA) | $(OUTPUT_DIR)
+	. $(VENV) && cat $(HASHTAG_DATA) | hxlcut -i meta+dataset,meta+hash | hxlsort -t meta+hash | hxldedup -t meta+dataset > $@
+
+# Reports
+
+$(OUTPUT_DIR)/report-hashtags-by-data-series.csv: $(HASHTAG_DATA) $(DATA_SERIES_DATA) | $(OUTPUT_DIR)
+	. $(VENV) && cat $(HASHTAG_DATA) \
+		| hxlcut -i meta+tag,meta+dataset \
+		| hxlmerge -m $(DATA_SERIES_DATA) -k meta+dataset -t meta+hash \
+		| hxldedup -t meta+tag,meta+hash \
+		| hxlcount -t meta+tag \
+		| hxlsort -r -t meta+count \
+		> $@
+
+$(OUTPUT_DIR)/report-tagspecs-by-data-series.csv: $(HASHTAG_DATA) $(DATA_SERIES_DATA) | $(OUTPUT_DIR)
+	. $(VENV) && cat $(HASHTAG_DATA) \
+		| hxlcut -i meta+tagspec,meta+dataset \
+		| hxlmerge -m $(DATA_SERIES_DATA) -k meta+dataset -t meta+hash \
+		| hxldedup -t meta+tagspec,meta+hash \
+		| hxlcount -t meta+tagspec \
+		| hxlsort -r -t meta+count \
+		> $@
+
+$(OUTPUT_DIR)/report-hashtags-by-org.csv: $(HASHTAG_DATA) | $(OUTPUT_DIR)
+	. $(VENV) && cat $(HASHTAG_DATA) \
+		| hxlcut -i meta+tag,org \
+		| hxldedup -t meta+tag,org \
+		| hxlcount -t meta+tag \
+		| hxlsort -r -t meta+count \
+		> $@
+
+$(OUTPUT_DIR)/report-tagspecs-by-org.csv: $(HASHTAG_DATA) | $(OUTPUT_DIR)
+	. $(VENV) && cat $(HASHTAG_DATA) \
+		| hxlcut -i meta+tagspec,org \
+		| hxldedup -t meta+tagspec,org \
+		| hxlcount -t meta+tagspec \
+		| hxlsort -r -t meta+count \
+		> $@
+
+
+# Admin
 
+# Activate the freshly created venv before installing, so pip3 puts the
+# requirements into the venv rather than into the surrounding Python.
 $(VENV):
-	rm -rf venv && python3 -m venv venv && pip3 install -r requirements.txt
+	rm -rf venv && python3 -m venv venv && . $(VENV) && pip3 install -r requirements.txt
 
+# Created on demand; rules writing into it list it as an order-only prerequisite.
+$(OUTPUT_DIR):
+	mkdir -p $(OUTPUT_DIR)
+
 sync:
 	git checkout main && git pull origin main && git push origin main
 
-clean:
+clean-all: clean-reports clean-venv
+
+clean-reports:
+	rm -f $(REPORTS)
+
+clean-venv:
 	rm -rf venv
 
-real-clean: clean
-	rm -rf output && mkdir output
diff --git a/ckan-tags.py b/ckan-tags.py
new file mode 100644
index 0000000..455cf53
--- /dev/null
+++ b/ckan-tags.py
@@ -0,0 +1,45 @@
+""" Find tags correlated with HXL datasets
+"""
+
+import ckancrawler, csv, sys
+
+URL = 'https://data.humdata.org'
+""" HDX URL
+"""
+
+SEARCH_FQ = 'vocab_Topics:hxl'
+""" CKAN tag for HXL
+"""
+
+USER_AGENT = 'HDX-Developer-2015'
+""" User agent to pass to CKAN (for analytics)
+"""
+
+crawler = ckancrawler.Crawler(URL, delay=0, user_agent=USER_AGENT)
+
+output = csv.writer(sys.stdout)
+
+output.writerow([
+    "Dataset",
+    "Organisation",
+    "CKAN tag",
+])
+
+output.writerow([
+    "#meta+dataset",
+    "#org+provider",
+    "#meta+tag",
+])
+
+for dataset in crawler.packages(fq=SEARCH_FQ):
+
+    for tag in dataset['tags']:
+        if tag['name'] != 'hxl':
+            output.writerow([
+                dataset['name'],
+                dataset['organization']['name'],
+                tag['name'],
+            ])
+
+sys.exit(0)
+
diff --git a/requirements.txt b/requirements.txt
index 61c8d3a..d28640a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 ckanapi
+ckancrawler
 libhxl>=4.7
 requests