Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
ericjeangirard committed Nov 18, 2021
1 parent de65f57 commit b57b1f4
Show file tree
Hide file tree
Showing 27 changed files with 1,370 additions and 1 deletion.
25 changes: 25 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Paths listed here are left out of the Docker build context, so the
# Dockerfile's `COPY . /src` never copies them into the image.

# Python: bytecode, stub files, the local venv and cache directories,
# plus the contents of notebooks/ and tests/
__pycache__
*.pyc
*.pyi
venv
.cache
notebooks/*
tests/*

# IDE settings
.idea

# Log files
*.log

# Packaging output
dist
*.egg-info

# Git metadata and GitHub configuration
.git/
.github

# Documentation
doc
38 changes: 38 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Ubuntu 18.04 base with Python 3.6, the MongoDB 3.4 packages and the Python
# dependencies from requirements.txt; the application is copied to /src.
FROM ubuntu:18.04

# wget and gnupg2 are needed to download and register the MongoDB signing key.
RUN apt-get update \
    && apt-get install -y wget \
        gnupg2

# Download the key to a file instead of piping wget into apt-key: with a pipe
# the default /bin/sh only reports apt-key's exit status, so a failed download
# would go unnoticed. With `&&` the build stops at the first failing step.
RUN wget -qO /tmp/mongodb-server-3.4.asc https://www.mongodb.org/static/pgp/server-3.4.asc \
    && apt-key add /tmp/mongodb-server-3.4.asc \
    && rm -f /tmp/mongodb-server-3.4.asc

# Register the MongoDB 3.4 apt repository.
RUN echo "deb [ arch=amd64,arm64 ] http://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/3.4 multiverse" | tee /etc/apt/sources.list.d/mongodb-org-3.4.list

# System packages; the apt lists are removed in the same layer to keep it small.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3.6 \
        python3-pip \
        libpython3.6 \
        jq \
        mongodb-org \
        locales \
        locales-all \
        python3-setuptools \
        g++ \
        python3-dev \
        npm \
        curl \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /src

# `ENV key=value` is the current syntax; the space-separated form is legacy.
ENV LC_ALL=en_US.UTF-8 \
    LANG=en_US.UTF-8 \
    LANGUAGE=en_US.UTF-8

# Copy requirements.txt on its own first so the dependency layers stay cached
# until that file changes.
COPY requirements.txt /src/requirements.txt
# --no-cache-dir keeps pip's download cache out of the image layers.
RUN pip3 install --no-cache-dir --upgrade pip
# HTTP_PROXY is one of Docker's predefined build arguments; when it is not
# supplied at build time it expands to an empty string here.
RUN pip3 install --no-cache-dir -r requirements.txt --proxy=${HTTP_PROXY}

COPY . /src
34 changes: 34 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Docker image built and pushed by the docker-* targets. Uses the same
# repository name as docker_push_ext.sh (dataesr/harvest-hal).
DOCKER_IMAGE_NAME=dataesr/harvest-hal

# Version string read from the single-quoted value in project/__init__.py.
# Deliberately recursive (=): it is evaluated when a docker-* recipe runs, so
# `make release docker-build VERSION=x` tags the image with the new version.
CURRENT_VERSION=$(shell cut -d "'" -f 2 project/__init__.py)

# Interpreter used by python-build; override with e.g. `make PYTHON=python3`.
PYTHON ?= python

# None of these targets produce a file of the same name; without .PHONY a
# stray file called `start` or `release` would silently disable the target.
.PHONY: start stop release docker-build docker-push python-build

# Start the docker-compose stack in the background.
start:
	@echo harvest-hal starting...
	docker-compose up -d
	@echo harvest-hal started http://localhost:5004

# Stop and remove the docker-compose stack.
stop:
	@echo harvest-hal stopping...
	docker-compose down
	@echo harvest-hal stopped

# Write VERSION into project/__init__.py, commit and tag it.
# Usage: make release VERSION=1.2.3
# The first recipe line aborts when VERSION is empty; otherwise an empty
# version would be written and `git tag` would only list the existing tags.
release:
	$(if $(strip $(VERSION)),,$(error VERSION is required: make release VERSION=1.2.3))
	echo "__version__ = '$(VERSION)'" > project/__init__.py
	git commit -am '[release] version $(VERSION)'
	git tag $(VERSION)
	@echo If everything is OK, you can push with tags i.e. git push origin main --tags

# Build the image, tagged with both the current version and `latest`.
docker-build:
	@echo Building a new docker image
	docker build -t $(DOCKER_IMAGE_NAME):$(CURRENT_VERSION) -t $(DOCKER_IMAGE_NAME):latest .
	@echo Docker image built

# Push the versioned tag and `latest`.
docker-push:
	@echo Pushing a new docker image
	docker push $(DOCKER_IMAGE_NAME):$(CURRENT_VERSION)
	docker push $(DOCKER_IMAGE_NAME):latest
	@echo Docker image pushed

# Build a source distribution with setup.py.
python-build:
	@echo Building a python package
	$(PYTHON) setup.py sdist
	@echo Python package built
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
# harvest-hal
# Download pdf

2 changes: 2 additions & 0 deletions docker_push_ext.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/sh
# Build the image from the local Dockerfile and push it to the registry.
#
# Usage: ./docker_push_ext.sh [VERSION]
#   VERSION  tag applied next to the default tag (defaults to 0.0.0)
#
# -e stops the script when the build fails, so a stale local image is never
# pushed; -u turns a misspelled variable into an error.
set -eu

IMAGE="dataesr/harvest-hal"
VERSION="${1:-0.0.0}"

sudo docker build -f Dockerfile -t "${IMAGE}" -t "${IMAGE}:${VERSION}" .
# -a pushes every local tag of the repository, not only the two built above.
sudo docker push -a "${IMAGE}"
33 changes: 33 additions & 0 deletions manage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import redis
from rq import Connection, Worker
from flask.cli import FlaskGroup

from project.server import create_app
from werkzeug.serving import WSGIRequestHandler

# Application instance created at import time; run_worker reads its config
# (REDIS_URL, QUEUES) from it.
app = create_app()
# Flask CLI group that the @cli.command() functions below register on; it is
# given the factory itself rather than the instance created above.
cli = FlaskGroup(create_app=create_app)


@cli.command()
def test():
    """Runs the unit tests without test coverage.

    Exits with status 1 when any test fails.
    """
    # Imported here because the module-level imports do not include unittest;
    # without it the command fails with NameError on the next line.
    import unittest

    tests = unittest.TestLoader().discover("project/tests", pattern="test*.py")
    result = unittest.TextTestRunner(verbosity=2).run(tests)
    if not result.wasSuccessful():
        # click ignores a command's return value when run from the command
        # line, so `return 1` would still exit 0. Raise SystemExit so that
        # shells and CI see the failure.
        raise SystemExit(1)
    return 0


@cli.command("run_worker")
def run_worker():
    # Start an RQ worker for the queues named in the app config, using the
    # Redis instance at REDIS_URL. Written as a comment, not a docstring, so
    # the command's CLI help text stays as it was.
    connection = redis.from_url(app.config["REDIS_URL"])
    with Connection(connection):
        # worker.work() runs the worker's processing loop.
        Worker(app.config["QUEUES"]).work()


# Script entry point: only runs when manage.py is executed directly.
if __name__ == "__main__":
    # Make werkzeug's request handler answer with HTTP/1.1 before any command
    # is dispatched.
    # NOTE(review): presumably set to get keep-alive connections on the
    # development server; confirm the intent.
    WSGIRequestHandler.protocol_version = "HTTP/1.1"
    cli()
218 changes: 218 additions & 0 deletions notebooks/.ipynb_checkpoints/Untitled-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 77,
"id": "fd46ad3a",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from urllib.parse import quote_plus"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5cde00fd",
"metadata": {},
"outputs": [],
"source": [
"year = 2000\n",
"nb_rows = 100\n",
"cursor='*'\n",
"data = []\n",
"while True:\n",
" url = f'https://api.archives-ouvertes.fr/search/?q=*:*&wt=json&fl=*&fq=submittedDateY_i:[{year}%20TO%20{year}]&sort=docid asc&rows={nb_rows}&cursorMark={cursor}'\n",
" r = requests.get(url)\n",
" res = r.json()\n",
" new_cursor = quote_plus(res['nextCursorMark'])\n",
" data += res['response']['docs']\n",
" if new_cursor == cursor:\n",
" break\n",
" cursor = new_cursor"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "53d07165",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1675"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(data)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "6353259d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'error': {'metadata': ['error-class',\n",
" 'org.apache.solr.common.SolrException',\n",
" 'root-error-class',\n",
" 'java.lang.IllegalArgumentException'],\n",
" 'msg': \"Unable to parse 'cursorMark' after totem: value must either be '*' or the 'nextCursorMark' returned by a previous search: AoFa12c\",\n",
" 'code': 400}}"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r.json()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "a83e23c4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'https://api.archives-ouvertes.fr/search/?q=*:*&wt=json&fl=*&fq=submittedDateY_i:[1950%20TO%201950]&sort=docid asc&rows=1&cursorMark=AoFQrnI='"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"url"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "31dfec4b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['numFound', 'start', 'docs'])"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r.json()['response'].keys()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "fa429c97",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['response', 'nextCursorMark'])"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r.json().keys()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "8f368b0a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"160"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r.json()['response']['numFound']"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "295c75e3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'AoFQrnI='"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r.json()['nextCursorMark']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "58daec78",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit b57b1f4

Please sign in to comment.