Skip to content

Commit

Permalink
Add analysis for CKAN tags and report generation in Makefile.
Browse files Browse the repository at this point in the history
  • Loading branch information
davidmegginson committed Sep 7, 2022
1 parent 397132b commit 7ea21d7
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 8 deletions.
84 changes: 76 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,21 +1,89 @@
# Activation script of the Python virtualenv; recipes source it before
# running any Python or HXL command.
VENV := venv/bin/activate

# Directory that receives every generated CSV file.
OUTPUT_DIR := output

# Script whose standard output becomes HASHTAG_DATA.
HASHTAG_SCRIPT := crawl-hdx.py
HASHTAG_DATA := $(OUTPUT_DIR)/hdx-hashtag-stats.csv

# Earlier names for the same script and CSV file.
SCRIPT := crawl-hdx.py
DATA := $(OUTPUT_DIR)/hdx-hashtag-stats.csv

# Script whose standard output is merged into CKAN_TAG_DATA.
CKAN_TAG_SCRIPT := ckan-tags.py
CKAN_TAG_DATA := $(OUTPUT_DIR)/hxl-ckan-tags.csv

run: $(DATA)
# Lookup table derived from HASHTAG_DATA (columns meta+dataset and meta+hash).
DATA_SERIES_DATA := $(OUTPUT_DIR)/data-series.csv

$(DATA): $(VENV) $(SCRIPT)
. $(VENV) && mkdir -p $(OUTPUT_DIR) && python3 $(SCRIPT) > $@
# Report files built by the `reports` target and deleted by `clean-reports`.
REPORTS := $(addprefix $(OUTPUT_DIR)/, \
    report-hashtags-by-data-series.csv \
    report-tagspecs-by-data-series.csv \
    report-hashtags-by-org.csv \
    report-tagspecs-by-org.csv)

# Short command-style names for the generated files.  They are declared
# phony because none of them is a file this Makefile creates: without the
# declaration, a stray file or directory called e.g. "reports" or "all"
# would make the target look up to date and silently skip the build.
.PHONY: all hashtag-data ckan-tags data-series reports

# Build both crawled data sets.
all: hashtag-data ckan-tags

# Hashtag statistics CSV.
hashtag-data: $(HASHTAG_DATA)

# CKAN tag CSV.
ckan-tags: $(CKAN_TAG_DATA)

# Dataset-to-hash lookup table.
data-series: $(DATA_SERIES_DATA)

# Every report listed in REPORTS.
reports: $(REPORTS)

# Run the hashtag script inside the virtualenv and capture its standard
# output as the target file.
#
# $(OUTPUT_DIR) is an order-only prerequisite (after the "|"): the directory
# must exist before the shell can open "$@" for writing, but its timestamp
# must not trigger a rebuild, since a directory's mtime changes whenever a
# file inside it is created.
$(HASHTAG_DATA): $(VENV) $(HASHTAG_SCRIPT) | $(OUTPUT_DIR)
	. $(VENV) && python3 $(HASHTAG_SCRIPT) > $@

# Run the CKAN tag script inside the virtualenv and pipe its output through
# hxlmerge, which pulls the meta+hash column in from the data-series table
# using meta+dataset as the join key.
$(CKAN_TAG_DATA): $(VENV) $(CKAN_TAG_SCRIPT) $(DATA_SERIES_DATA)
	. $(VENV) && python3 $(CKAN_TAG_SCRIPT) \
	  | hxlmerge -k meta+dataset -t meta+hash -m $(DATA_SERIES_DATA) \
	  > $@

# Derive the dataset-to-hash lookup table from the hashtag statistics:
# keep only meta+dataset and meta+hash, sort on meta+hash, then drop rows
# that repeat a meta+dataset value.
$(DATA_SERIES_DATA): $(VENV) $(HASHTAG_DATA)
	. $(VENV) && hxlcut -i meta+dataset,meta+hash < $(HASHTAG_DATA) \
	  | hxlsort -t meta+hash \
	  | hxldedup -t meta+dataset \
	  > $@

# Reports

# Hashtags by data series.  Pipeline: keep meta+tag and meta+dataset, merge
# in meta+hash from the data-series table, drop repeated (meta+tag, meta+hash)
# pairs, count rows per meta+tag, and sort by meta+count in reverse order.
# "$<" is the first prerequisite, i.e. the hashtag statistics file.
$(OUTPUT_DIR)/report-hashtags-by-data-series.csv: $(HASHTAG_DATA) $(DATA_SERIES_DATA)
	. $(VENV) && hxlcut -i meta+tag,meta+dataset < $< \
	  | hxlmerge -m $(DATA_SERIES_DATA) -k meta+dataset -t meta+hash \
	  | hxldedup -t meta+tag,meta+hash \
	  | hxlcount -t meta+tag \
	  | hxlsort -r -t meta+count \
	  > $@

# Tagspecs by data series.  Pipeline: keep meta+tagspec and meta+dataset,
# merge in meta+hash from the data-series table, drop repeated
# (meta+tagspec, meta+hash) pairs, count rows per meta+tagspec, and sort by
# meta+count in reverse order.
# "$<" is the first prerequisite, i.e. the hashtag statistics file.
$(OUTPUT_DIR)/report-tagspecs-by-data-series.csv: $(HASHTAG_DATA) $(DATA_SERIES_DATA)
	. $(VENV) && hxlcut -i meta+tagspec,meta+dataset < $< \
	  | hxlmerge -m $(DATA_SERIES_DATA) -k meta+dataset -t meta+hash \
	  | hxldedup -t meta+tagspec,meta+hash \
	  | hxlcount -t meta+tagspec \
	  | hxlsort -r -t meta+count \
	  > $@

# Hashtags by organisation.  Pipeline: keep meta+tag and org, drop repeated
# (meta+tag, org) pairs, count rows per meta+tag, and sort by meta+count in
# reverse order.
# "$<" is the only prerequisite, i.e. the hashtag statistics file.
$(OUTPUT_DIR)/report-hashtags-by-org.csv: $(HASHTAG_DATA)
	. $(VENV) && hxlcut -i meta+tag,org < $< \
	  | hxldedup -t meta+tag,org \
	  | hxlcount -t meta+tag \
	  | hxlsort -r -t meta+count \
	  > $@

# Tagspecs by organisation.  Pipeline: keep meta+tagspec and org, drop
# repeated (meta+tagspec, org) pairs, count rows per meta+tagspec, and sort
# by meta+count in reverse order.
# "$<" is the only prerequisite, i.e. the hashtag statistics file.
$(OUTPUT_DIR)/report-tagspecs-by-org.csv: $(HASHTAG_DATA)
	. $(VENV) && hxlcut -i meta+tagspec,org < $< \
	  | hxldedup -t meta+tagspec,org \
	  | hxlcount -t meta+tagspec \
	  | hxlsort -r -t meta+count \
	  > $@


# Admin

# (Re)create the virtualenv from scratch and install requirements.txt into
# it.  The target is the venv's activate script, so the rule runs only while
# that file is missing.
#
# The fresh activate script ("$@") is sourced before pip3 runs so that pip3
# resolves to the virtualenv's own copy.  A bare "pip3 install" would use
# whichever pip3 is first on PATH and put the packages outside the venv that
# every other recipe activates.
$(VENV):
	rm -rf venv && python3 -m venv venv && . $@ && pip3 install -r requirements.txt

# Create the output directory ("$@" is the directory itself); -p makes the
# command succeed when it already exists.
$(OUTPUT_DIR):
	mkdir -p $@

# Check out main, pull it from origin and push it back.
# Declared phony: "sync" is a command, not a file, so a file of that name
# must never make the target look up to date.
.PHONY: sync
sync:
	git checkout main && git pull origin main && git push origin main

# Cleanup targets are commands, not files.
.PHONY: clean clean-all clean-reports clean-venv real-clean

# "real-clean" lists "clean" as a prerequisite, so "clean" must exist and do
# real work: it is an alias for "clean-all".
clean: clean-all

# Remove the generated reports and the virtualenv.
clean-all: clean-reports clean-venv

# Delete only the report files; the crawled data is kept.
clean-reports:
	rm -f $(REPORTS)

# Delete the virtualenv; the $(VENV) rule recreates it on demand.
clean-venv:
	rm -rf venv

# Everything "clean" does, then wipe the whole output directory and leave an
# empty one in its place.
real-clean: clean
	rm -rf $(OUTPUT_DIR) && mkdir -p $(OUTPUT_DIR)
45 changes: 45 additions & 0 deletions ckan-tags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
""" Find tags correlated with HXL datasets

Crawls the datasets matching SEARCH_FQ and writes a CSV to standard output
with one row per (dataset, tag) pair, skipping the HXL tag itself. The
first row holds text headers and the second row holds HXL hashtags.
"""

import csv
import sys

import ckancrawler

# HDX URL
URL = 'https://data.humdata.org'

# CKAN tag for HXL
SEARCH_FQ = 'vocab_Topics:hxl'

# User agent to pass to CKAN (for analytics)
USER_AGENT = 'HDX-Developer-2015'

# Name of the tag that is left out of the output
HXL_TAG = 'hxl'


def tag_rows(dataset):
    """ Yield one output row per non-HXL tag of a dataset.

    @param dataset: a CKAN package dict as returned by the crawler
    @returns: an iterator of [dataset name, organisation name, tag name] lists
    """
    # CKAN can report "organization": null for a dataset without an owner,
    # and a package is not guaranteed to carry a "tags" list; treat both as
    # empty instead of raising.
    organisation = dataset.get('organization') or {}
    for tag in dataset.get('tags') or []:
        if tag['name'] != HXL_TAG:
            yield [
                dataset['name'],
                organisation.get('name', ''),
                tag['name'],
            ]


def main():
    """ Crawl the matching datasets and write the CSV to standard output.

    @returns: the process exit status (always 0 when the crawl completes)
    """
    crawler = ckancrawler.Crawler(URL, delay=0, user_agent=USER_AGENT)

    output = csv.writer(sys.stdout)

    # Text header row
    output.writerow([
        "Dataset",
        "Organisation",
        "CKAN tag",
    ])

    # HXL hashtag row
    output.writerow([
        "#meta+dataset",
        "#org+provider",
        "#meta+tag",
    ])

    for dataset in crawler.packages(fq=SEARCH_FQ):
        output.writerows(tag_rows(dataset))

    return 0


if __name__ == '__main__':
    sys.exit(main())

1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
ckanapi
ckancrawler
libhxl>=4.7
requests

0 comments on commit 7ea21d7

Please sign in to comment.