diff --git a/.github/workflows/mkdocs-deploy.yml b/.github/workflows/mkdocs-deploy.yml
deleted file mode 100644
index f17ea8d..0000000
--- a/.github/workflows/mkdocs-deploy.yml
+++ /dev/null
@@ -1,25 +0,0 @@
-name: mkdocs-deploy
-on:
- push:
- branches:
- - docs
-
-permissions:
- contents: write
-jobs:
- deploy:
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
- - name: Configure Git Credentials
- run: |
- git config --global user.name 'GitHub Actions'
- git config --global user.email 'actions@github.com'
- - name: Set up Python 3.11
- uses: actions/setup-python@v5
- with:
- python-version: "3.11"
- - name: Install Dependencies
- run: pip install mkdocs-material[imaging] mkdocs-autorefs mkdocs-get-deps mkdocs-material-extensions mkdocstrings mkdocstrings-python mkdocs-git-revision-date-localized-plugin
- - name: Deploy docs
- run: mkdocs gh-deploy --force --verbose
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
deleted file mode 100644
index b2e9c30..0000000
--- a/.github/workflows/python-publish.yml
+++ /dev/null
@@ -1,52 +0,0 @@
-# This workflow will upload a Python Package to PyPi when a Release is created
-name: Publish Python Package
-
-on:
- release:
- types: [published]
-
-permissions:
- contents: read
-
-env:
- PYPI_USERNAME: __token__
- PYPI_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
-
-jobs:
- publish:
- name: Publish to PyPi
- runs-on: ubuntu-latest
-
- steps:
- - name: Checkout the code
- uses: actions/checkout@v4
-
- - name: Set up Python 3.11
- uses: actions/setup-python@v5
- with:
- python-version: "3.11"
-
- - name: Install Poetry
- run: |
- curl -sSL https://install.python-poetry.org | python - -y
- echo "$HOME/.local/bin" >> $GITHUB_PATH
-
- - name: Install Python package dependencies
- run: |
- poetry config virtualenvs.create false
- poetry install --sync --no-interaction
-
- - name: Inject the latest Code Analyzer JAR
- run: |
- CODE_ANALYZER_URL=$(curl -s https://api.github.com/repos/IBM/codenet-minerva-code-analyzer/releases/latest | jq -r '.assets[] | .browser_download_url')
- echo "Downloading: " $CODE_ANALYZER_URL
- wget -q $CODE_ANALYZER_URL
- echo "Moving codeanalyzer jar to:" ${{ github.workspace }}/cldk/analysis/java/codeanalyzer/jar/
- mv codeanalyzer-*.jar ${{ github.workspace }}/cldk/analysis/java/codeanalyzer/jar/
-
- - name: Build package
- run: poetry build
-
- - name: Publish package distributions to PyPI
- run: poetry publish --username $PYPI_USERNAME --password $PYPI_PASSWORD
-
\ No newline at end of file
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 377b062..cf7c58b 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -12,46 +12,55 @@ jobs:
release:
runs-on: ubuntu-latest
- env:
- JAVA_HOME: ${{ github.workspace }}/graalvm-ce-java11-22.3.3
-
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: '3.11'
- - name: Set up JDK 11 from GraalVM
- run: |
- echo "${{ env.JAVA_HOME }}/bin" >> $GITHUB_PATH
- wget https://github.com/graalvm/graalvm-ce-builds/releases/download/vm-22.3.3/graalvm-ce-java11-linux-amd64-22.3.3.tar.gz
- tar -xvzf graalvm-ce-java11-linux-amd64-22.3.3.tar.gz
- ${{ env.JAVA_HOME }}/bin/gu install native-image
+ - name: Set up GraalVM CE Java 11
+ uses: graalvm/setup-graalvm@v1
+ with:
+ java-version: '11'
+ distribution: 'graalvm-community'
+ github-token: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Install jq
+ run: sudo apt-get update && sudo apt-get install -y jq
- name: Install Poetry
run: |
curl -sSL https://install.python-poetry.org | python3 -
echo "${HOME}/.local/bin" >> $GITHUB_PATH
+ export PATH="${HOME}/.local/bin:$PATH"
- - name: Configure Poetry
- run: poetry config virtualenvs.in-project true
-
- - name: Install Dependencies
- run: poetry install --no-root
+ - name: Install Python package dependencies
+ run: |
+ poetry config virtualenvs.create false
+ poetry install --sync --no-interaction
- name: Run Tests
- id: build
- continue-on-error: true # Allow workflow continuation on failure
+ id: test
+ continue-on-error: true
run: poetry run make test
- name: Delete tag on failure
- if: steps.build.outcome != 'success'
+ if: steps.test.outcome == 'failure'  # use outcome: with continue-on-error, conclusion is always 'success'
run: |
+ echo "Tests failed. Deleting tag ${GITHUB_REF#refs/tags/}..."
git push --delete origin ${GITHUB_REF#refs/tags/}
- exit 1 # Fail the workflow
+ exit 1
+
+ - name: Inject the latest Code Analyzer JAR
+ run: |
+ CODE_ANALYZER_URL=$(curl -s https://api.github.com/repos/codellm-devkit/codeanalyzer-java/releases/latest | jq -r '.assets[] | select(.name | endswith(".jar")) | .browser_download_url')
+ echo "Downloading: $CODE_ANALYZER_URL"
+ wget -q "$CODE_ANALYZER_URL"
+ mkdir -p ${{ github.workspace }}/cldk/analysis/java/codeanalyzer/jar/
+ mv codeanalyzer-*.jar ${{ github.workspace }}/cldk/analysis/java/codeanalyzer/jar/
- name: Build Package
run: poetry build
@@ -72,3 +81,6 @@ jobs:
body: ${{ steps.gen_changelog.outputs.changelog }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Publish package distributions to PyPI
+ run: poetry publish --username __token__ --password ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.github/workflows/release_config.json b/.github/workflows/release_config.json
index f0d4b5b..200120c 100644
--- a/.github/workflows/release_config.json
+++ b/.github/workflows/release_config.json
@@ -1,36 +1,62 @@
{
"categories": [
+ {
+ "title": "## ✨ Release",
+ "labels": [
+ "release"
+ ]
+ },
{
"title": "## 🚀 Features",
- "labels": ["kind/feature", "enhancement"]
+ "labels": [
+ "kind/feature",
+ "enhancement"
+ ]
},
{
"title": "## 🐛 Fixes",
- "labels": ["fix", "bug"]
+ "labels": [
+ "fix",
+ "bug"
+ ]
},
{
"title": "## ♻️ Refactoring",
- "labels": ["refactoring"]
+ "labels": [
+ "refactoring"
+ ]
},
{
"title": "## ⚡️ Performance Improvements",
- "labels": ["performance"]
+ "labels": [
+ "performance"
+ ]
},
{
"title": "## \uD83D\uDCDA Documentation",
- "labels": ["documentation", "doc"]
+ "labels": [
+ "documentation",
+ "doc"
+ ]
},
{
"title": "## \uD83D\uDEA6 Tests",
- "labels": ["test"]
+ "labels": [
+ "test"
+ ]
},
{
"title": "## \uD83D\uDEE0 Other Updates",
- "labels": ["other", "kind/dependency-change"]
+ "labels": [
+ "other",
+ "kind/dependency-change"
+ ]
},
{
"title": "## 🚨 Breaking Changes",
- "labels": ["breaking"]
+ "labels": [
+ "breaking"
+ ]
}
],
"ignore_labels": [
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..21c4bd5
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,66 @@
+# Contributing to Codellm-Devkit
+
+Thank you for your interest in contributing to the Codellm-Devkit (CLDK)! Here we provide guidelines for contributing to any repository in the codellm-devkit organization.
+
+## How to Contribute
+
+### Issues and Discussions
+- For bugs and actionable items, please prefer creating an issue in the relevant repository
+- For open-ended or design discussions _specifically related to the specification_, use our [specification discussions](https://github.com/codellm-devkit/specification/discussions)
+- For other general discussions that are not suitable as issues, use our [organization discussions](https://github.com/orgs/codellm-devkit/discussions)
+
+In all cases, please check for duplicates before creating new issues or discussions!
+
+### Pull Requests
+We welcome PRs across all our repositories! When submitting:
+- Fork the repository
+- Follow existing code style
+- Include tests where applicable
+- Update documentation as needed
+- Link related issues
+
+## Development Guidelines
+
+### Code Quality
+- Follow the repository's established patterns
+- Include appropriate documentation
+- Add tests for new functionality
+- Handle errors appropriately
+
+### Documentation
+- Keep READMEs current
+- Document configuration options
+- Provide clear examples
+- Include setup instructions
+
+### Security
+- Follow security best practices
+- Implement proper input validation
+- Document security considerations
+
+## Getting Started
+
+1. Fork the repository
+2. Clone your fork:
+ ```bash
+ git clone https://github.com/your-username/repository-name.git
+ ```
+3. Create a feature branch:
+ ```bash
+ git checkout -b my-feature
+ ```
+4. Make your changes and commit:
+ ```bash
+ git commit -m "Description of changes"
+ ```
+5. Push and create a Pull Request
+
+## Code of Conduct
+
+Please note that this project is released with a [Code of Conduct](CODE_OF_CONDUCT.md). By participating in this project you agree to abide by its terms.
+
+## License
+
+By contributing, you agree that your contributions will be licensed under the Apache 2.0 License.
+
+Thank you for contributing to Codellm-Devkit!
\ No newline at end of file
diff --git a/README.md b/README.md
index 383c38a..dc40a55 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
+
@@ -54,257 +54,195 @@ Codellm-Devkit is an ongoing project, developed at IBM Research.
For any questions, feedback, or suggestions, please contact the authors:
-| Name | Email |
-| ---- | ----- |
-| Rahul Krishna | [i.m.ralk@gmail.com](mailto:imralk+oss@gmail.com) |
-| Rangeet Pan | [rangeet.pan@ibm.com](mailto:rangeet.pan@gmail.com) |
-| Saurabh Sihna | [sinhas@us.ibm.com](mailto:sinhas@us.ibm.com) |
+| Name | Email |
+| ------------- | --------------------------------------------------- |
+| Rahul Krishna | [i.m.ralk@gmail.com](mailto:imralk+oss@gmail.com) |
+| Rangeet Pan | [rangeet.pan@ibm.com](mailto:rangeet.pan@gmail.com) |
+| Saurabh Sinha | [sinhas@us.ibm.com](mailto:sinhas@us.ibm.com) |
## Table of Contents
- [Contact](#contact)
- [Table of Contents](#table-of-contents)
+- [Quick Start](#quick-start)
- [Architectural and Design Overview](#architectural-and-design-overview)
-- [Quick Start: Example Walkthrough](#quick-start-example-walkthrough)
- - [Prerequisites](#prerequisites)
- - [Step 1: Set up an Ollama server](#step-1--set-up-an-ollama-server)
- - [Pull the latest version of Granite 8b instruct model from ollama](#pull-the-latest-version-of-granite-8b-instruct-model-from-ollama)
- - [Step 2: Install CLDK](#step-2--install-cldk)
- - [Step 3: Build a code summarization pipeline](#step-3--build-a-code-summarization-pipeline)
+ - [1. **Data Models**](#1-data-models)
+ - [2. **Analysis Backends**](#2-analysis-backends)
+ - [Java](#java)
+ - [Python](#python)
+ - [C](#c)
+ - [3. **Utilities and Extensions**](#3-utilities-and-extensions)
+- [Contributing](#contributing)
- [Publication (papers and blogs related to CLDK)](#publication-papers-and-blogs-related-to-cldk)
-## Architectural and Design Overview
-
-Below is a very high-level overview of the architectural of CLDK:
-
-
-```mermaid
-graph TD
-User <--> A[CLDK]
- A --> 15[Retrieval ‡]
- A --> 16[Prompting ‡]
- A[CLDK] <--> B[Languages]
- B --> C[Java, Python, Go ‡, C ‡, JavaScript ‡, TypeScript ‡, Rust ‡]
- C --> D[Data Models]
- D --> 13{Pydantic}
- 13 --> 7
- C --> 7{backends}
- 7 <--> 9[WALA]
- 9 <--> 14[Analysis]
- 7 <--> 10[Tree-sitter]
- 10 <--> 14[Analysis]
- 7 <--> 11[LLVM ‡]
- 11 <--> 14[Analysis]
- 7 <--> 12[CodeQL ‡]
- 12 <--> 14[Analysis]
-
+## Quick Start
-X[‡ Yet to be implemented]
-```
+In this section, we will walk through a simple example to demonstrate how to get started with CLDK.
-The user interacts by invoking the CLDK API. The CLDK API is responsible for handling the user requests and delegating them to the appropriate language-specific modules.
+1. Install the CLDK package using pip:
+
+ ```bash
+ pip install cldk
+ ```
-Each language comprises of two key components: data models and backends.
-1. **Data Models:** These are high level abstractions that represent the various language constructs and componentes in a structured format using pydantic. This confers a high degree of flexibility and extensibility to the models as well as allowing for easy accees of various data components via a simple dot notation. In addition, the data models are designed to be easily serializable and deserializable, making it easy to store and retrieve data from various sources.
+2. To use CLDK, just import the `CLDK` class from the `cldk` module:
+
+ ```python
+ from cldk import CLDK
+ ```
-2. **Analysis Backends:** These are the components that are responsible for interfacing with the various program analysis tools. The core backends are Treesitter, Javaparse, WALA, LLVM, and CodeQL. The backends are responsible for handling the user requests and delegating them to the appropriate analysis tools. The analysis tools perfrom the requisite analysis and return the results to the user. The user merely calls one of several high-level API functions such as `get_method_body`, `get_method_signature`, `get_call_graph`, etc. and the backend takes care of the rest.
+3. Next, to select a language for analysis, create an instance of the `CLDK` class with the desired language:
- Some langugages may have multiple backends. For example, Java has WALA, Javaparser, Treesitter, and CodeQL backends. The user has freedom to choose the backend that best suits their needs.
+ ```python
+ cldk = CLDK(language="java") # For Java analysis
+ ```
-We are currently working on implementing the retrieval and prompting components. The retrieval component will be responsible for retrieving the relevant code snippets from the codebase for RAG usecases. The prompting component will be responsible for generating the prompts for the CodeLLMs using popular prompting frameworks such as `PDL`, `Guidance`, or `LMQL`.
+4. Create an analysis object over the Java application by providing the path to the project:
-## Quick Start: Example Walkthrough
+ ```python
+ analysis = cldk.analysis(project_path="/path/to/your/java/project")
+ ```
+ This initializes the analysis pipeline for the specified project. The backend analysis engine parses the Java project, builds a symbol table representing the program structure, and returns it to CLDK, which maps it onto the CLDK data schema (`cldk/models/java/models.py`).
-In this section, we will walk through a simple example to demonstrate how to use CLDK. We will:
+ Depending on the size of the project, this step may take some time as it involves parsing, building, and statically analyzing the codebase.
-* Set up a local ollama server to interact with CodeLLMs
-* Build a simple code summarization pipeline for a Java and a Python application.
+5. Once the analysis is complete, you can call the various methods provided by the `analysis` object to interact with the analyzed codebase. For example, you can retrieve method bodies, signatures, and call graphs.
-### Prerequisites
+ ```python
+ # Iterate over all the files in the project
+ from cldk import CLDK
-Before we begin, make sure you have the following prerequisites installed:
+ analysis: JavaAnalysis = CLDK(language="java").analysis(project_path="/path/to/your/java/project")
+
+ all_files = [file_path for file_path, class_file in analysis.get_symbol_table().items()]
- * Python 3.11 or later
- * Ollama v0.3.4 or later
+ # Process each file
+ for file_path in all_files:
+ # Additional processing can be done here
+ pass
+ ```
-If you are using [Visual Studio Code](https://code.visualstudio.com) with the [Dev Containers](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) extension along with [Docker Desktop](https://www.docker.com/products/docker-desktop) or [Rancher Desktop](https://w3.ibm.com/w3publisher/docker-desktop/rancher-desktop), this project contains a Dev Container environment for you to develop in.
+ Likewise, you can also retrieve method bodies.
-Use the following commands:
+ ```python
+ from cldk import CLDK
-```bash
-git clone https://github.com/codellm-devkit/python-sdk.git
-cd python-dsk
-code .
-```
+ analysis: JavaAnalysis = CLDK(language="java").analysis(project_path="/path/to/your/java/project")
+ for class_file in analysis.get_symbol_table().values():
+ for type_name, type_declaration in class_file.type_declarations.items():
+ for method in type_declaration.callable_declarations.values():
+ method_body = analysis.get_method_body(method.declaration)
+ print(f"Method: {method.declaration}\nBody: {method_body}\n")
+ ```
-When Visual Studio Code starts, select the option to **Reopen in Container** and a development environment with Python, Java, C, and Rust will be available to you. See [Developing inside a Container](https://code.visualstudio.com/docs/devcontainers/containers) for more detailed instructions.
+## Architectural and Design Overview
-### Step 1: Set up an Ollama server
+Below is a very high-level overview of the architecture of CLDK:
-If don't already have ollama, please download and install it from here: [Ollama](https://ollama.com/download).
-Once you have ollama, start the server and make sure it is running.
+```mermaid
+graph TD
+User <--> A[CLDK]
-If you're on MacOS, Linux, or WSL, you can check to make sure the server is running by running the following command:
+ A --> A1[cldk.analysis]
+
+ A1 --> A2[cldk.analysis.java]
+ A2 --> A3[codeanalyzer → WALA]
+ A3 --> JA[Analysis]
+
+ A1 --> A4[cldk.analysis.c]
+ A4 --> A5[clang]
+ A5 --> CA[Analysis]
+
+ A1 --> A6[cldk.analysis.python]
+ A6 --> A7[treesitter_python]
+ A7 --> PA[Analysis]
+
+ A1 --> A8[cldk.analysis.commons]
+ A8 --> LSP[LSP]
+ A8 --> TS[treesitter base]
+ A8 --> TU[treesitter utils]
+
+ A --> M[cldk.models]
+ M --> MJ[Java models]
+ M --> MP[Python models]
+ M --> MC[C models]
+ M --> MT[treesitter models]
+
+ A --> U[cldk.utils]
+ U --> UX[exceptions]
+ U --> UL[logging]
+ U --> US[sanitization]
+ US --> USJ[java sanitization]
-```bash
-sudo systemctl status ollama
```
-You should see an output similar to the following:
-
-```bash
-➜ sudo systemctl status ollama
-● ollama.service - Ollama Service
- Loaded: loaded (/etc/systemd/system/ollama.service; enabled; preset: enabled)
- Active: active (running) since Sat 2024-08-10 20:39:56 EDT; 17s ago
- Main PID: 23069 (ollama)
- Tasks: 19 (limit: 76802)
- Memory: 1.2G (peak: 1.2G)
- CPU: 6.745s
- CGroup: /system.slice/ollama.service
- └─23069 /usr/local/bin/ollama serve
-```
+The user interacts with the CLDK API via the top-level `CLDK` interface exposed in `core.py`. This interface is responsible for configuring the analysis session, initializing language-specific pipelines, and exposing a high-level, language-agnostic API for interacting with program structure and semantics.
-If not, you may have to start the server manually. You can do this by running the following command:
+CLDK is currently implemented with full support for **Java**, **Python**, and **C**. Each language module is structured around two core components: **data models** and **analysis backends**.
-```bash
-sudo systemctl start ollama
-```
-#### Pull the latest version of Granite 8b instruct model from ollama
+### 1. **Data Models**
-To pull the latest version of the Granite 8b instruct model from ollama, run the following command:
+Each supported language has its own set of Pydantic-based data models, located in the `cldk.models` module (e.g., `cldk.models.java`, `cldk.models.python`, `cldk.models.c`). These models provide:
-```bash
-ollama pull granite-code:8b-instruct
-```
+- **Structured representations** of language elements such as classes, methods, annotations, fields, and statements.
+- **Typed access** using dot notation (e.g., `method.return_type` or `klass.methods`), promoting developer productivity.
+- **Serialization support** to and from JSON and other formats, enabling easy storage, inspection, and exchange of analysis results.
+- **Consistency** across languages via shared modeling conventions and base abstractions, including a common treesitter schema.
-Check to make sure the model was successfully pulled by running the following command:
-```bash
-ollama run granite-code:8b-instruct 'Write a function to print hello world in python'
-```
-The output should be similar to the following:
+### 2. **Analysis Backends**
-```
-➜ ollama run granite-code:8b-instruct 'Write a function to print hello world in python'
+Each language has a dedicated analysis backend implemented under `cldk.analysis.