Update to debian bookworm #2214

Draft: wants to merge 1 commit into develop

(This repository was archived by the owner on Oct 16, 2024, and is now read-only.)
15 changes: 7 additions & 8 deletions capstone/Dockerfile
@@ -1,8 +1,8 @@
-FROM python:3.7-buster
+FROM python:3.11.9-bookworm
ENV PYTHONUNBUFFERED 1

# Enable apt-get -t buster-backports
-RUN echo 'deb http://deb.debian.org/debian buster-backports main' > /etc/apt/sources.list.d/backports.list
+#RUN echo 'deb http://deb.debian.org/debian buster-backports main' > /etc/apt/sources.list.d/backports.list

# Get build dependencies and packages required by the app
# FIRST LINE:
@@ -22,9 +22,9 @@ RUN echo 'deb http://deb.debian.org/debian buster-backports main' > /etc/apt/sou
# htmltidy for fastcase ingest
RUN apt-get update \
&& apt-get install -y redis-server postgresql-client libtiff-tools pdftk \
-&& apt-get install -y librocksdb5.17 librocksdb-dev libsnappy-dev zlib1g-dev libbz2-dev libgflags-dev liblz4-dev rocksdb-tools \
+&& apt-get install -y librocksdb7.8 librocksdb-dev libsnappy-dev zlib1g-dev libbz2-dev libgflags-dev liblz4-dev rocksdb-tools \
&& apt-get install -y libx11-xcb1 libxtst6 libgtk-3-0 libnss3 \
-&& echo libhyperscan5 libhyperscan/cpu-ssse3 boolean true | debconf-set-selections && apt-get -t buster-backports install -y libhyperscan-dev \
+&& echo libhyperscan5 libhyperscan/cpu-ssse3 boolean true | debconf-set-selections && apt-get install -y libhyperscan-dev \
&& apt-get install -y tidy \
&& apt-get install -y libdrm2 libgbm-dev \
&& apt-get install --no-install-recommends libdbus-glib-1-2
@@ -33,7 +33,7 @@ RUN apt-get update \
RUN mkdir /app
WORKDIR /app
COPY requirements.txt /app
-RUN pip install pip==21.3.1 \
+RUN pip install pip==24.0 \
&& pip install -r requirements.txt --src /usr/local/src \
&& rm requirements.txt

@@ -44,9 +44,8 @@ RUN echo "--modules-folder /node_modules" > /.yarnrc
COPY package.json /app
COPY yarn.lock /app
# pin node version -- see https://github.com/nodesource/distributions/issues/33
-RUN curl -o nodejs.deb https://deb.nodesource.com/node_14.x/pool/main/n/nodejs/nodejs_14.21.3-1nodesource1_amd64.deb \
-&& dpkg -i ./nodejs.deb \
-&& rm nodejs.deb \
+RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
+&& apt-get install -y nodejs=20.12.0-1nodesource1 \
&& npm install -g [email protected] \
&& yarn install --frozen-lockfile \
&& rm package.json \
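In short: the base image moves from Debian buster (Python 3.7) to bookworm (Python 3.11.9), so the buster-backports apt source is commented out rather than updated; on bookworm, libhyperscan-dev installs from the regular repositories, and librocksdb moves to the 7.8 series that bookworm ships. Node.js now comes from NodeSource's setup_20.x script instead of a hand-downloaded .deb. One caveat the diff does not address: pinning the exact build nodejs=20.12.0-1nodesource1 can start failing if NodeSource rotates that build out of its repository, so the pin may need periodic bumps.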
24 changes: 1 addition & 23 deletions capstone/capapi/api_urls.py
@@ -1,9 +1,7 @@
from django.conf import settings
from django.urls import path, re_path, include
from django.views.generic import RedirectView, TemplateView
-from rest_framework import routers, permissions
-from drf_yasg.views import get_schema_view
-from drf_yasg import openapi
+from rest_framework import routers

from capapi.views import api_views

@@ -27,32 +25,12 @@ def get(self, request, *args, **kwargs):
return super().get(request, *args, **kwargs)
router.APIRootView = FilteredAPIRootView


-schema_view = get_schema_view(
-    openapi.Info(
-        title="CAP API",
-        default_version='v1',
-        description="United States Caselaw",
-        terms_of_service="https://%s/terms" % settings.PARENT_HOST,
-        contact=openapi.Contact(url="https://%s/contact" % settings.PARENT_HOST),
-    ),
-    urlconf='capapi.api_urls',
-    validators=['flex', 'ssv'],
-    public=True,
-    permission_classes=(permissions.AllowAny,),
-)

urlpatterns = [
    path('v1/', include(router.urls)),
    path('unstable/', include(unstable_router.urls)),
    # convenience pattern: catch all citations, redirect in CaseDocumentViewSet's retrieve
    re_path(r'^v1/cases/(?P<id>[0-9A-Za-z\s\.]+)/$', api_views.CaseDocumentViewSet.as_view({'get': 'retrieve'}), name='case-get-cite'),

-    ### Swagger/OpenAPI/ReDoc ###
-    re_path(r'^swagger(?P<format>\.json|\.yaml)$', schema_view.without_ui(cache_timeout=None), name='schema-json'),
-    re_path(r'^swagger/$', schema_view.with_ui('swagger', cache_timeout=None), name='schema-swagger-ui'),
-    re_path(r'^redoc/$', schema_view.with_ui('redoc', cache_timeout=None), name='schema-redoc'),
-
    path('robots.txt', TemplateView.as_view(template_name='robots_api.txt', content_type='text/plain'), name='robots_api'),
    path('', RedirectView.as_view(url='/v1/', permanent=False), name='api-root')
]
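The drf-yasg schema view and its swagger/redoc routes are removed outright rather than ported, presumably for Python 3.11 compatibility. If schema endpoints are wanted again, a maintained generator such as drf-spectacular could stand in. A minimal sketch, assuming drf-spectacular were added to requirements (not part of this PR; names are illustrative):

    from django.urls import path
    from drf_spectacular.views import SpectacularAPIView, SpectacularSwaggerView

    urlpatterns += [
        path('schema/', SpectacularAPIView.as_view(urlconf='capapi.api_urls'), name='schema'),
        path('swagger/', SpectacularSwaggerView.as_view(url_name='schema'), name='schema-swagger-ui'),
    ]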
30 changes: 1 addition & 29 deletions capstone/capapi/tests/test_api.py
@@ -1,7 +1,6 @@
from csv import DictReader
from io import StringIO

-from flaky import flaky
from rest_framework.request import Request
from rest_framework.test import APIRequestFactory

@@ -583,10 +582,8 @@ def test_filter_reporter(client, reporter):

# NGRAMS

-@flaky(max_runs=10)  # ngrammed_cases call to ngram_jurisdictions doesn't reliably work because it uses multiprocessing within pytest environment
@pytest.mark.django_db(databases=['capdb'])
-def test_ngrams_api(client, request):
-    ngrammed_cases = request.getfixturevalue('ngrammed_cases')  # load fixture inside test so flaky() can catch errors
+def test_ngrams_api(client, ngrammed_cases):

# check result counts when not filtering by jurisdiction
json = client.get(api_reverse('ngrams-list'), {'q': 'one two'}).json()
@@ -600,31 +597,6 @@ def test_ngrams_api(client, request):
'one two': {
'jur1': [{'year': '2000', 'count': [1, 6], 'doc_count': [1, 2]}]}}

-    # check wildcard match
-    json = client.get(api_reverse('ngrams-list'), {'q': 'three *'}).json()
-    assert json['results'] == {
-        'three four': {
-            'total': [{'year': '2000', 'count': [1, 9], 'doc_count': [1, 3]}]},
-        "three don't": {
-            'total': [{'year': '2000', 'count': [2, 9], 'doc_count': [2, 3]}]}}
-
-
-# API SPECIFICATION ENDPOINTS
-@pytest.mark.django_db(databases=['capdb'])
-@pytest.mark.parametrize("url, content_type", [
-    (api_reverse("schema-swagger-ui"), 'text/html'),
-    (api_reverse("schema-json", args=['.json']), 'application/json'),
-    (api_reverse("schema-json", args=['.yaml']), 'application/yaml'),
-])
-def test_swagger(client, url, content_type):
-    response = client.get(url)
-    check_response(response, content_type=content_type)
-
-
-def test_redoc(client):
-    response = client.get(api_reverse("schema-redoc"))
-    check_response(response, content_type="text/html")


# PAGINATION
@pytest.mark.django_db(databases=['capdb'])
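The ngrams test change deserves a gloss: the @flaky retry and the request.getfixturevalue() call existed only so fixture-setup failures could be retried; with the retry logic dropped, the fixture can be a plain parameter. Side by side, distilled from the hunks above:

    # before: fixture loaded inside the test body so @flaky could catch setup errors
    @flaky(max_runs=10)
    def test_ngrams_api(client, request):
        ngrammed_cases = request.getfixturevalue('ngrammed_cases')
        ...

    # after: pytest injects the fixture directly and fails fast if setup breaks
    def test_ngrams_api(client, ngrammed_cases):
        ...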
184 changes: 23 additions & 161 deletions capstone/capapi/views/api_views.py
@@ -1,9 +1,6 @@
-import bisect
import urllib
import re
from datetime import datetime
-from collections import defaultdict
-from pathlib import Path

from django.utils.functional import partition
from django_filters.utils import translate_validation
@@ -25,7 +22,6 @@
from capapi.resources import api_request
from capdb import models
from capdb.models import CaseMetadata
-from capdb.storages import ngram_kv_store_ro
from capweb.helpers import cache_func
from scripts.helpers import alphanum_lower
from user_data.models import UserHistory
@@ -232,10 +228,11 @@ class CaseDocumentViewSet(BaseDocumentViewSet):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.valid_query_fields = [
-            *[field.name for backend in self.query_filter_backends
-              for field in backend().get_schema_fields(self)],
-            *[backend.search_param for backend in self.query_filter_backends
-              if hasattr(backend, 'search_param')]
+            # get_schema_fields() doesn't work in python 3.11, so this would need another approach if we want it:
+            # *[field.name for backend in self.query_filter_backends
+            #   for field in backend().get_schema_fields(self)],
+            # *[backend.search_param for backend in self.query_filter_backends
+            #   if hasattr(backend, 'search_param')]
        ]

def is_full_case_request(self):
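The commented-out lookup leaves valid_query_fields empty, which presumably disables whatever validation consumed it. A minimal sketch of a coreapi-free alternative, assuming the stock DRF backend attributes (hypothetical; this project's elasticsearch filter backends may expose their params differently):

    def collect_valid_query_fields(view):
        # read the query-param names each backend declares as plain attributes,
        # instead of calling the coreapi-based get_schema_fields()
        names = []
        for backend in view.query_filter_backends:
            for attr in ('search_param', 'ordering_param'):
                if hasattr(backend, attr):
                    names.append(getattr(backend, attr))
        return names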
@@ -414,27 +411,6 @@ def __init__(self, *args, **kwargs):
self.jurisdiction_id_to_slug = dict(models.Jurisdiction.objects.values_list('pk', 'slug'))
self.jurisdiction_id_to_slug[None] = 'total'
self.jurisdiction_slug_to_id = {v:k for k,v in self.jurisdiction_id_to_slug.items()}
-        self.totals_by_jurisdiction_year_length = self.load_totals()
-
-    @staticmethod
-    def load_totals():
-        # populate self.totals_by_jurisdiction_year_length, a mapping of jurisdiction-year-length to counts, like:
-        # {
-        #     (<jur_id>, <year>, <length>): (<word count>, <document count>),
-        # }
-        if not Path(ngram_kv_store_ro.db_path()).exists():
-            return {}
-        totals_by_jurisdiction_year_length = defaultdict(lambda: [0,0])
-        for k, v in ngram_kv_store_ro.get_prefix(b'totals', packed=True):
-            jur, year, n = ngram_kv_store_ro.unpack(k[len(b'totals'):])
-            totals_by_jurisdiction_year_length[(jur, year, n)] = v
-            for total in (
-                totals_by_jurisdiction_year_length[(None, year, n)],
-                totals_by_jurisdiction_year_length[(None, None, n)]
-            ):
-                total[0] += v[0]
-                total[1] += v[1]
-        return totals_by_jurisdiction_year_length

@staticmethod
def query_params_are_filters(query_body):
@@ -455,27 +431,6 @@ def query_params_are_filters(query_body):

return True


-    def get_query_data_from_api_query(self, q):
-        # given an `api(...)` query, return a structured list of filters and aggregations
-        # validate whether a case ID exists in the corpus
-        # check if the supplied item is a valid case id
-        if not q or not (q.startswith('api(') and q.endswith(')')):
-            return False
-
-        query_body = None
-        try:
-            query_body = QueryDict(q[4:-1], mutable=True)
-        except Exception:
-            raise ValidationError({'error': 'Query is not in a URL parameter format.'})
-
-        self.query_params_are_filters(query_body)
-
-        query_body['page_size'] = 1
-        query_body['facet'] = 'decision_date'
-
-        return query_body

@staticmethod
def create_timeline_entries(bucket_entries, total_dict, jurisdiction):
# generate timeline datapoint given an elasticsearch query result
@@ -561,117 +516,24 @@ def list(self, request, *args, **kwargs):
if not q:
return Response({})

-        # check if we're querying for a case as opposed to a word
-        # default to keyword search if value is empty
-        api_query_body = self.get_query_data_from_api_query(q)
-
-        # prepend word count as first byte. only applicable for n-grams
-        words = q.lower().split(' ')[:3]  # use first 3 words
-        q_len = len(words)
-        q_sig = bytes([q_len]) + ' '.join(words).encode('utf8')
-
-        if api_query_body:
-            try:
-                results = self.get_citation_data(request, api_query_body, q)
-            except filters.TooManyJoinedResultsException:
-                raise ValidationError({'error': 'The set of cases to cite to is too large. Consider \
-                    narrowing this group to contain less than 20000 cases.'})
-            pairs = []
-        elif q_sig.endswith(b' *'):
-            results = {}
-            # wildcard search
-            pairs = ngram_kv_store_ro.get_prefix(q_sig[:-1], packed=True)
-        else:
-            results = {}
-            # non-wildcard search
-            value = ngram_kv_store_ro.get(q_sig, packed=True)
-            if value:
-                pairs = [(q_sig, value)]
-            else:
-                pairs = []

-        ## format results
-        if pairs:
-            # prepare jurisdiction_filter from jurisdiction= query param
-            jurisdictions = request.GET.getlist('jurisdiction')
-            if '*' in jurisdictions:
-                jurisdiction_filter = None
-            else:
-                jurisdiction_filter = set(self.jurisdiction_slug_to_id[j] for j in jurisdictions if j in self.jurisdiction_slug_to_id)
-                if not jurisdiction_filter:
-                    jurisdiction_filter.add(None)
-
-            # prepare year_filter from year= query param
-            year_filter = set()
-            for year in request.GET.getlist('year'):
-                if year.isdigit():
-                    year_filter.add(int(year))
-
-            # get top 10 pairs
-            top_pairs = []
-            for gram, data in pairs:
-                total_jur = data[None]
-                sort_count = total_jur[None][0]
-                bisect.insort_right(top_pairs, (sort_count, gram, data))
-                top_pairs = top_pairs[-10:]
-
-            # Reformat stored gram data for delivery.
-            # top_pairs will look like:
-            #   [
-            #     (<sort_count>, b'<wordcount><gram>', {
-            #       <jur_id>: [
-            #         <year - 1900>, <instance_count>, <document_count>,
-            #         <year - 1900>, <instance_count>, <document_count>, ...
-            #       ]),
-            #   ]
-            # this reformats to:
-            #   {
-            #     <jurisdiction slug>: [
-            #       {
-            #         'year': <year>,
-            #         'count': [<instance_count>, <total instances>],
-            #         'doc_count': [<instance_count>, <total instances>],
-            #       }
-            #     ]
-            #   }
-            for _, gram, data in reversed(top_pairs):
-                out = {}
-                for jur_id, years in data.items():
-
-                    # apply jurisdiction_filter
-                    if jurisdiction_filter and jur_id not in jurisdiction_filter:
-                        continue
-
-                    years_out = []
-                    jur_slug = self.jurisdiction_id_to_slug[jur_id]
-                    if jur_id is None:
-                        years = [i for k, v in years.items() for i in [k]+v]
-                    for i in range(0, len(years), 3):
-                        year, count, doc_count = years[i:i+3]
-
-                        # filter out total
-                        if year is None:
-                            continue
-
-                        # years will be -1900 for msgpack compression -- add 1900 back in
-                        year += 1900
-
-                        # apply year filter
-                        if year_filter and year not in year_filter:
-                            continue
-
-                        totals = self.totals_by_jurisdiction_year_length[(jur_id, year, q_len)]
-                        years_out.append({
-                            "year": str(year) if year else "total",
-                            "count": [count, totals[0]],
-                            "doc_count": [doc_count, totals[1]]
-                        })
-
-                    years_out.sort(key=lambda y: y["year"])
-                    out[jur_slug] = years_out
-
-                if out:
-                    results[gram[1:].decode('utf8')] = out
+        if q.startswith('api(') and q.endswith(')'):
+            q = q[4:-1]
+
+        try:
+            api_query_body = QueryDict(q, mutable=True)
+        except Exception:
+            raise ValidationError({'error': 'Query is not in a URL parameter format.'})
+
+        self.query_params_are_filters(api_query_body)
+
+        api_query_body['page_size'] = 1
+        api_query_body['facet'] = 'decision_date'
+
+        try:
+            results = self.get_citation_data(request, api_query_body, q)
+        except filters.TooManyJoinedResultsException:
+            raise ValidationError({'error': 'The set of cases to cite to is too large. Consider \
+                narrowing this group to contain less than 20000 cases.'})

paginated = {
"count": len(results),
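The api(...) handling is now inlined in list() and is simple enough to demonstrate standalone. A minimal sketch of the same parsing steps (the query value is hypothetical; encoding is passed explicitly so QueryDict does not need configured Django settings):

    from django.http import QueryDict

    q = "api(search=insurance&jurisdiction=ill)"  # hypothetical query value
    if q.startswith('api(') and q.endswith(')'):
        q = q[4:-1]  # strip the api(...) wrapper

    body = QueryDict(q, mutable=True, encoding='utf-8')
    body['page_size'] = 1            # one case per timeline bucket
    body['facet'] = 'decision_date'  # aggregate counts by decision date

    assert body['search'] == 'insurance'
    assert body['facet'] == 'decision_date'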
2 changes: 1 addition & 1 deletion capstone/capdb/management/commands/run_celery_worker.py
@@ -6,7 +6,7 @@

# Create a command to autoreload celery beat for elasticsearch indexing
def autoreload_celery(*args, **kwargs):
-    celery_worker_cmd = "celery worker -A config.celery.app -c 1 -B --uid=nobody --gid=nogroup"
+    celery_worker_cmd = "celery -A config.celery.app -c 1 -B --uid=nobody --gid=nogroup worker"
    print("Kill lingering celery worker...")
    subprocess.run(shlex.split(f'pkill -f "{celery_worker_cmd}"'))
    print("Start celery worker...")
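One thing to double-check on this reordering: Celery 5 moved worker flags behind the subcommand, so the usual 5.x form is celery -A config.celery.app worker -c 1 -B --uid=nobody --gid=nogroup, with the options after worker rather than before it. Whether the string above parses depends on the Celery version pinned in requirements; the same string is also what the pkill -f call matches, so the command and the kill pattern must stay in sync either way.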