feat: integrate BERTopic for topic modeling
Added dependencies for BERTopic, llvmlite, numba, and Neo4j.
Implemented incremental topic modeling with BERTopic in main.py,
including model initialization, data loading, fitting, saving,
and updating topics in Neo4j.
Added FastAPI and ConnectRPC.
Septimus4 committed Oct 30, 2024
1 parent fd9cb88 commit fc269a5
Showing 19 changed files with 2,641 additions and 224 deletions.
21 changes: 19 additions & 2 deletions .github/workflows/ci.yml
@@ -1,15 +1,28 @@
name: CI

on:
  push:
    branches: [ main ]
  pull_request:
    types: [ opened, synchronize, reopened, ready_for_review ]
    branches: [ main ]

jobs:
  build:
    name: Build and Test
    runs-on: ubuntu-latest
    if: ${{ !github.event.pull_request.draft }}
    services:
      neo4j:
        image: neo4j:5
        ports:
          - 7687:7687 # Bolt port
          - 7474:7474 # HTTP port
        env:
          NEO4J_AUTH: neo4j/${{ secrets.NEO4J_PASSWORD }} # the Neo4j image expects user/password separated by a slash
        options: >-
          --health-cmd="curl -f http://localhost:7474 || exit 1"
          --health-interval=10s
          --health-timeout=5s
          --health-retries=5
    strategy:
      matrix:
@@ -32,4 +45,8 @@ jobs:
        run: poetry run pre-commit run --all-files --show-diff-on-failure

      - name: Run tests
        env:
          NEO4J_URI: bolt://localhost:7687
          NEO4J_USER: neo4j
          NEO4J_PASSWORD: ${{ secrets.NEO4J_PASSWORD }}
        run: poetry run pytest
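The env block exported to the test step implies the suite reads these variables to reach the Neo4j service container. A minimal sketch of how a test fixture might consume them — hypothetical, since the repository's actual test wiring is not shown in this diff:

```python
# conftest.py — hypothetical fixture; the name and scope are assumptions
import os

import pytest
from neo4j import GraphDatabase


@pytest.fixture(scope="session")
def neo4j_driver():
    # Read the connection details that ci.yml exports to the test step
    uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
    auth = (os.environ.get("NEO4J_USER", "neo4j"),
            os.environ["NEO4J_PASSWORD"])
    driver = GraphDatabase.driver(uri, auth=auth)
    yield driver
    driver.close()
```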
5 changes: 1 addition & 4 deletions .github/workflows/codeql.yml
@@ -1,17 +1,14 @@
name: "CodeQL"

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
schedule:
- cron: '35 6 * * 3' # Runs at 06:35 every Wednesday

jobs:
analyze:
name: Analyze
runs-on: ubuntu-latest
if: ${{ !github.event.pull_request.draft }}

permissions:
actions: read
1 change: 1 addition & 0 deletions .gitignore
@@ -164,3 +164,4 @@ cython_debug/
.idea/Concord.iml
.idea/modules.xml
.idea/vcs.xml
/.idea/developer-tools.xml
4 changes: 1 addition & 3 deletions .idea/misc.xml

Some generated files are not rendered by default.

15 changes: 8 additions & 7 deletions .pre-commit-config.yaml
@@ -1,19 +1,20 @@
 repos:
   - repo: local
     hooks:
-      - id: black
-        name: black
-        entry: poetry run black
+      - id: yapf
+        name: yapf
+        entry: poetry run yapf
         language: system
         types: [ python ]
-        args: [ --check, --diff ]
         pass_filenames: false
+        args: [ "-i", "-r", "concord/", "tests/" ]
       - id: flake8
         name: flake8
         entry: poetry run flake8
         language: system
         types: [ python ]
         pass_filenames: false
+        args: [ "concord/", "tests/" ]
       - id: pytest
         name: pytest
         entry: poetry run pytest
         language: system
         pass_filenames: false
148 changes: 148 additions & 0 deletions README.md
@@ -1 +1,149 @@
# Concord

Concord is a Python project that leverages FastAPI, Neo4j, and BERTopic for advanced text analysis. It provides a
platform for analyzing and visualizing text data using state-of-the-art machine learning techniques.

## Table of Contents

- [Prerequisites](#prerequisites)
- [Installation](#installation)
- [Clone the Repository](#clone-the-repository)
- [Set Up Dependencies](#set-up-dependencies)
- [Debian-based Systems](#debian-based-systems)
- [Windows](#windows)
- [Running the Application](#running-the-application)
- [Start Docker Containers](#start-docker-containers)
- [Run Pre-commit Hooks and Tests](#run-pre-commit-hooks-and-tests)
- [Contributing](#contributing)
- [License](#license)

## Prerequisites

- **Python 3.12+**
- **Poetry** for dependency management
- **Docker** and **Docker Compose**
- **Git**

## Installation

### Clone the Repository

```bash
git clone https://github.com/yourusername/concord.git
cd concord
```

### Set Up Dependencies

#### Debian-based Systems

1. **Update Package Lists**

   ```bash
   sudo apt update
   ```

2. **Install Required Packages**

   ```bash
   sudo apt install -y software-properties-common curl git
   ```

3. **Install Python 3.12**

   Add the Deadsnakes PPA and install Python 3.12:

   ```bash
   sudo add-apt-repository ppa:deadsnakes/ppa
   sudo apt update
   sudo apt install -y python3.12 python3.12-venv python3.12-dev
   ```

4. **Install Poetry**

   ```bash
   curl -sSL https://install.python-poetry.org | python3 -
   echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc
   source ~/.bashrc
   ```

5. **Install Docker and Docker Compose**

   ```bash
   sudo apt install -y docker.io docker-compose
   sudo systemctl start docker
   sudo systemctl enable docker
   sudo usermod -aG docker $USER
   ```

   Log out and log back in for the group changes to take effect.

6. **Install Project Dependencies**

   ```bash
   poetry install
   poetry run pre-commit install
   ```

#### Windows

1. **Install Python 3.12**

   Download and install Python 3.12 from the [official website](https://www.python.org/downloads/windows/). During
   installation, make sure to check the box **"Add Python to PATH"**.

2. **Install Git**

   Download and install Git from the [official website](https://git-scm.com/download/win).

3. **Install Poetry**

   Open Command Prompt or PowerShell and run:

   ```powershell
   (Invoke-WebRequest -Uri https://install.python-poetry.org -UseBasicParsing).Content | python -
   ```

   Add Poetry to your PATH by adding the following line to your PowerShell profile:

   ```powershell
   $env:Path += ";$env:APPDATA\Python\Scripts"
   ```

4. **Install Docker Desktop**

   Download and install Docker Desktop from the [official website](https://www.docker.com/products/docker-desktop).
   Ensure that it is running before proceeding.

5. **Install Project Dependencies**

   ```powershell
   poetry install
   poetry run pre-commit install
   ```

## Running the Application

### Start Docker Containers

Set up a temporary Neo4j database:

```bash
docker-compose up -d
```

> **Note:** On Windows, ensure Docker Desktop is running and has sufficient resources allocated.
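
The repository's `docker-compose.yml` is not rendered in this diff; a minimal file along these lines would provide the temporary database (the image tag, ports, and credentials here are illustrative assumptions, not the repository's actual file):

```yaml
# docker-compose.yml — illustrative sketch only
services:
  neo4j:
    image: neo4j:5
    ports:
      - "7474:7474" # HTTP
      - "7687:7687" # Bolt
    environment:
      NEO4J_AUTH: neo4j/change-me # placeholder password
```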

### Run Pre-commit Hooks and Tests

```bash
poetry run pre-commit run -a
```
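
Because a `pytest` hook is registered in `.pre-commit-config.yaml`, this command runs the test suite as well; to run the tests on their own, use `poetry run pytest`.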

## Contributing

Contributions are welcome! Please open an issue or submit a pull request for any improvements or suggestions.

## License

This project is licensed under the [MIT License](LICENSE).
Empty file added concord/bert/__init__.py
44 changes: 44 additions & 0 deletions concord/bert/bert.py
@@ -0,0 +1,44 @@
# bert.py

import os

import joblib
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer


def initialize_model():
    """
    Initialize the BERTopic model.
    You can customize the model with different parameters as needed.
    """
    # Using a specific embedding model for better performance
    embedding_model = SentenceTransformer("all-mpnet-base-v2")

    topic_model = BERTopic(
        embedding_model=embedding_model,
        verbose=True,
        # You can add more parameters here
    )
    return topic_model


def save_model(model, path):
    """
    Save the BERTopic model to disk.
    """
    joblib.dump(model, path)
    print(f"Model saved to {path}")


def load_model(path):
    """
    Load the BERTopic model from disk.
    """
    if os.path.exists(path):
        model = joblib.load(path)
        print(f"Model loaded from {path}")
        return model
    else:
        print(f"No existing model found at {path}. Initializing a new model.")
        return None
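A minimal round trip through these helpers might look like the sketch below (assuming the module is importable as `bert.bert`, as `concord.py` does; the corpus loader is hypothetical — BERTopic needs a reasonably large document set to fit):

```python
# Illustrative usage of the helpers above
from bert.bert import initialize_model, load_model, save_model

model = load_model("bertopic_model.pkl")
if model is None:
    model = initialize_model()

docs = load_corpus_somehow()  # hypothetical loader; supply your own documents
topics, probs = model.fit_transform(docs)
save_model(model, "bertopic_model.pkl")
```

Note that BERTopic also ships its own `save`/`load` methods, which may be preferable for large models since they can handle the underlying embedding model separately from the topic representation.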
84 changes: 84 additions & 0 deletions concord/concord.py
@@ -0,0 +1,84 @@
# concord.py

from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

from bert.bert import load_model, save_model

# Configuration
MODEL_SAVE_PATH = "bertopic_model.pkl"
INITIAL_BATCH_SIZE = 100  # Number of documents in the initial batch
BATCH_SIZE = 50  # Number of documents per new batch
TOTAL_BATCHES = 10  # Total number of batches to simulate


def load_data(batch_number, batch_size):
    """
    Simulate loading a batch of data.
    For demonstration, we're using the 20 Newsgroups dataset.
    In a real-world scenario, replace this with your data loading mechanism.
    """
    newsgroups = fetch_20newsgroups(subset="all",
                                    remove=("headers", "footers", "quotes"))
    start = batch_number * batch_size
    end = start + batch_size
    if start >= len(newsgroups.data):
        return []
    return newsgroups.data[start:end]


def setup():
    # Load existing model or initialize a new one
    topic_model = load_model(MODEL_SAVE_PATH)

    # Initialize data storage
    all_documents = []

    if topic_model is None:
        # Initial batch
        print("Processing initial batch...")
        initial_batch = load_data(batch_number=0,
                                  batch_size=INITIAL_BATCH_SIZE)
        all_documents.extend(initial_batch)

        # Fit the model with the initial batch
        topic_model = BERTopic()
        topics, probs = topic_model.fit_transform(all_documents)

        # Save the model
        save_model(topic_model, MODEL_SAVE_PATH)
    else:
        # If the model exists, load existing documents if stored.
        # For simplicity, we're not storing all_documents; in a real
        # application, consider persisting them.
        print("Existing model loaded. Starting incremental updates...")
        # Optionally, load existing documents from a file or database
        # all_documents = load_existing_documents()

    # Simulate incremental updates with new batches
    for batch_num in range(1, TOTAL_BATCHES + 1):
        print(f"\nProcessing batch {batch_num}...")
        new_batch = load_data(batch_number=batch_num, batch_size=BATCH_SIZE)

        if not new_batch:
            print("No more data to process.")
            break

        all_documents.extend(new_batch)

        # Refit the model with all documents (existing + new)
        print("Refitting the BERTopic model with the updated dataset...")
        topic_model = BERTopic()
        topics, probs = topic_model.fit_transform(all_documents)

        # Optionally, analyze topics
        print(
            f"Number of topics after batch {batch_num}: {len(set(topics)) - (1 if -1 in topics else 0)}"
        )

        # Save the updated model
        save_model(topic_model, MODEL_SAVE_PATH)

        # Optional: Integrate with Neo4j to store/update topics
        # store_topics_in_neo4j(topic_model, batch_num)

    print("\nTopic modeling completed.")
10 changes: 10 additions & 0 deletions concord/fast_api.py
@@ -0,0 +1,10 @@
# fast_api.py

from fastapi import FastAPI

app = FastAPI(title="Concord API", version="1.0")


@app.get("/")
def read_root():
    return {"message": "Welcome to Concord API"}
Empty file added concord/graph/__init__.py
