Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
0258641
added ocrmypdf
JonnyTran Aug 5, 2025
b4e15e6
refactor: enhance PDF preprocessing with configurable settings and in…
JonnyTran Aug 5, 2025
b7b8a1b
feat: add margin analysis to PDF preprocessing with opencv-python
JonnyTran Aug 5, 2025
25a9098
feat: enable PDF preprocessing analysis with new configuration options
JonnyTran Aug 5, 2025
ab7f39b
feat: update PDF preprocessing settings and add new document analysis…
JonnyTran Aug 5, 2025
1a67fbc
feat: add new preprocessing options for Tesseract timeout and text sk…
JonnyTran Aug 5, 2025
3f84066
feat: introduce rotate pages threshold in PDF preprocessing settings …
JonnyTran Aug 5, 2025
9889a44
Merge branch 'develop' into feat/ocrmypdf-on-upload
JonnyTran Aug 6, 2025
42323ea
Merge branch 'develop' into feat/ocrmypdf-on-upload
JonnyTran Aug 6, 2025
b69e49e
merge conflicts
JonnyTran Aug 6, 2025
713c107
initial local commit
priyankeshh Aug 8, 2025
4940b2e
Modified database URL in .env.dev for better compatibility with user …
JonnyTran Aug 8, 2025
0e59377
optimize ocrmypdf params by updating `optimization` level to 0, disab…
JonnyTran Aug 8, 2025
804ed86
Update dependencies and optimize imports for lazy loading
JonnyTran Aug 8, 2025
85322a8
Merge branch 'develop' into feat/ocrmypdf-on-upload
JonnyTran Aug 8, 2025
30c970a
Enable PDF analysis and update preprocessing settings
JonnyTran Aug 8, 2025
1eacf58
fix typechecking
JonnyTran Aug 8, 2025
8d0704c
Refactor image handling and lazy load dependencies
JonnyTran Aug 8, 2025
484efd8
add lazy-loader
JonnyTran Aug 8, 2025
669d989
Refactor document upload jobs and logging
JonnyTran Aug 9, 2025
fcd8fe2
Implement PDF text layer detection using OCRmyPDF
JonnyTran Aug 9, 2025
960a400
fix opengl issues and import errors
JonnyTran Aug 9, 2025
eb952da
Merge branch 'develop' into feat/ocrmypdf-on-upload
JonnyTran Aug 10, 2025
a8d9ef0
chore: EXTRALIT_DATABASE_URL to use a relative path in .env.dev
JonnyTran Aug 10, 2025
290b519
Refactor PDF text layer detection and analysis
JonnyTran Aug 10, 2025
edb2b0c
chore: lazy import bibtexparser
JonnyTran Aug 12, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -155,3 +155,6 @@ src/**/server/static/
# App generated files
extralit-server/src/extralit_server/static
extralit/site

# Development files
*.db
12 changes: 11 additions & 1 deletion extralit-server/.env.dev
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES # Needed by RQ to work with forked processes on MacOS
ALEMBIC_CONFIG=src/extralit_server/alembic.ini
EXTRALIT_AUTH_SECRET_KEY=8VO7na5N/jQx+yP/N+HlE8q51vPdrxqlh6OzoebIyko= # With this we avoid using a different key every time the server is reloaded
EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///${HOME}/.extralit/extralit-dev.db?check_same_thread=False
EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///./extralit-dev.db?check_same_thread=False
HF_HUB_DISABLE_TELEMETRY=1

# S3 Configuration (skipped to use LocalFileStorage)
Expand All @@ -20,3 +20,13 @@ EXTRALIT_ELASTICSEARCH=http://localhost:9200

# Redis configuration
EXTRALIT_REDIS_URL=redis://localhost:6379/0

# PDF Preprocessing
PREPROCESSING_ENABLED=true
PREPROCESSING_ENABLE_ANALYSIS=true
PREPROCESSING_ROTATE_PAGES=true
PREPROCESSING_ROTATE_PAGES_THRESHOLD=2.0
PREPROCESSING_CLEAN=false
PREPROCESSING_SKIP_TEXT=true
PREPROCESSING_TESSERACT_TIMEOUT=0
PREPROCESSING_QUIET=false
768 changes: 351 additions & 417 deletions extralit-server/pdm.lock

Large diffs are not rendered by default.

12 changes: 9 additions & 3 deletions extralit-server/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name = "extralit-server"
dynamic = ["version"]
description = "Open-source tool for accurate & fast scientific literature data extraction with LLM and human-in-the-loop."
readme = "README.md"
requires-python = ">=3.9"
requires-python = ">=3.10"
license = { text = "Apache-2.0" }
keywords = [
"literature-review",
Expand Down Expand Up @@ -49,6 +49,7 @@ dependencies = [
"social-auth-core ~= 4.5.0",
# Background processing
"rq ~= 1.16.2",
"lazy-loader>=0.4",
# Info status
"psutil ~= 5.8, <5.10",
# For logging, tracebacks, printing, progressbars
Expand All @@ -60,12 +61,16 @@ dependencies = [
"standardwebhooks>=1.0.0",
# For HF dataset import
"datasets >= 3.0.1",
"pillow >= 10.4.0",
"pillow>=11.3.0",
# For Telemetry
"huggingface-hub>=0.26.2",
"Jinja2>=3.1.4", # Used by huggingface-hub to render dataset card templates
# For file storage
"minio>=7.2.7",
# For document processing
"ocrmypdf>=16.10.4",
"pdf2image>=1.17.0",
"opencv-python>=4.11.0.86",
]

[project.optional-dependencies]
Expand Down Expand Up @@ -182,8 +187,9 @@ worker = { cmd = "python -m extralit_server worker" }
server-dev.composite = [
"migrate",
"cli database users create_default",
"server",
"server-and-worker",
]
server-and-worker = { shell = "pdm run server & pdm run worker & wait" }
test = { cmd = "pytest --verbosity=1 --disable-warnings", env_file = ".env.test" }
test-cov = { cmd = "pytest tests --cov=extralit_server --cov-report=term --cov-report=xml --verbosity=0 --disable-warnings", env_file = ".env.test" }

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright 2024-present, Extralit Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Optional
from pydantic import BaseModel


class PDFMetadata(BaseModel):
    """
    Metadata for PDF processing results.

    Only ``filename`` and ``processing_time`` are required; every other field
    defaults to ``None`` when it is not supplied.
    """

    # Name of the processed PDF file.
    filename: str
    # NOTE(review): the unit is not stated in this model — presumably seconds; confirm with the producer.
    processing_time: float
    # Number of pages, when known.
    page_count: Optional[int] = None
    # Detected language identifiers, when available (format not defined here).
    language_detected: Optional[List[str]] = None
    # Free-form dict; its key schema is not defined by this model.
    processing_settings: Optional[Dict] = None
    # Free-form dict; its key schema is not defined by this model.
    analysis_results: Optional[Dict] = None
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# Copyright 2024-present, Extralit Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import uuid
from typing import Optional, Any, List, Union

from pydantic import BaseModel, Field, validator

"""
This module is deprecated and outdated; use it for reference only.
We may switch to LlamaIndexDocument or another document model in the future.
"""


class Segments(BaseModel):
    """Ordered container of text, table and figure segments of a document.

    Supports ``len(segments)`` and ``segments[index]`` by delegating to the
    underlying ``items`` list.
    """

    items: List[Union["TextSegment", "TableSegment", "FigureSegment"]] = Field(
        default_factory=list,
        description="List of segments in the reading order of the document",
    )

    def get(self, id: str, header: str | None = None, default=None):
        """Return the first segment matching ``id`` or ``header``.

        Args:
            id: Segment identifier to look for.
            header: Optional header text; when truthy, a segment whose header
                equals it also counts as a match.
            default: Value returned when no segment matches.

        Returns:
            The first matching segment in ``items`` order, otherwise ``default``.
        """
        for item in self.items:
            if item.id == id or (header and item.header == header):
                return item

        return default

    def __repr_str__(self, join_str: str) -> str:
        """Render one ``ClassName(segment)`` entry per item, each on its own line."""
        return "\n " + f"{join_str}\n ".join(f"{type(item).__name__}({item})" for item in self.items)

    @validator("items", pre=True, each_item=True)
    def parse_segments(cls, v):
        """Coerce each raw item into the segment class selected by its ``type``.

        Non-dict items are first converted with ``.dict()``. ``"figure"`` and
        ``"image"`` map to ``FigureSegment``; ``"table"`` (or any payload that
        has an ``"html"`` key) maps to ``TableSegment``; everything else maps
        to ``TextSegment``.
        """
        if not isinstance(v, dict):
            v = v.dict()

        # ``type`` is declared Optional on the segment models, so the key can be
        # present with a None value; fall back to "" instead of calling
        # ``.lower()`` on None.
        segment_type = (v.get("type") or "").lower()
        if segment_type in {"figure", "image"}:
            return FigureSegment(**v)
        elif segment_type == "table" or "html" in v:
            return TableSegment(**v)
        else:
            return TextSegment(**v)

    def __getitem__(self, index):
        """Return the item (or slice of items) at ``index``."""
        return self.items[index]

    def __len__(self):
        """Return the number of segments."""
        return len(self.items)


class Coordinates(BaseModel):
    """Position of an element on a page layout.

    ``points`` is required; the layout dimensions and the coordinate system
    are optional and default to ``None``.
    """

    points: List[List[float]] = Field(
        ..., description="List of 4 points, e.g. [[x1, y1], [x2, y1], [x1, y2], [x2, y2]]"
    )
    layout_width: Optional[int] = Field(None, description="Width of the layout")
    layout_height: Optional[int] = Field(None, description="Height of the layout")
    # Explicit None default, matching the other Optional fields: without it the
    # field is treated as required by pydantic v2 despite the Optional annotation.
    system: Optional[str] = Field(None, description="System of coordinates")

    def __repr_str__(self, join_str: str) -> str:
        """Return an empty string so no field values are rendered in the repr."""
        return ""


class TextSegment(BaseModel):
    """A plain-text segment of a document, optionally carrying a header.

    ``id`` defaults to a freshly generated UUID4 string and ``type`` defaults
    to ``"text"``; only ``text`` is required.
    """

    id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Unique identifier of the segment",
        repr=False,
    )

    header: Optional[str] = Field(None, description="Header of the element")
    text: str = Field(..., description="Content as plain text", repr=False)
    summary: Optional[str] = Field(None, description="Summary of the content")
    page_number: Optional[int] = Field(None, description="Page number of the segment")
    coordinates: Optional["Coordinates"] = Field(
        None,
        description="Coordinates of the element in the document",
        repr=False,
    )
    level: Optional[int] = Field(None, description="Level of the header")
    source: Optional[str] = Field(None, description="Source of the element", repr=False)
    type: Optional[str] = Field("text", description="Type of the element", repr=False)
    original: Optional[Any] = Field(
        None,
        exclude=True,
        description="Original object from which the segment was extracted",
        repr=False,
    )

    def text_cleaned(self):
        """Return ``text`` with ``" | "`` separators turned into spaces, ``"---"`` removed, and ends stripped."""
        without_pipes = self.text.replace(" | ", " ")
        without_rules = without_pipes.replace("---", "")
        return without_rules.strip()

    def __repr_str__(self, join_str: str) -> str:
        """Join the truthy repr arguments, abbreviating strings longer than 200 characters.

        Long strings are shown as their first and last 100 characters with
        newlines removed; falsy values are omitted entirely.
        """
        rendered = []
        for name, value in self.__repr_args__():
            # Skip empty/falsy values and the excluded key.
            if not value or name in {"INCLUDE_METADATA_KEYS"}:
                continue

            if name is None:
                rendered.append(repr(value))
            elif isinstance(value, str) and len(value) > 200:
                rendered.append(f'{name}="{value[:100]}...{value[-100:]}"'.replace("\n", ""))
            else:
                rendered.append(f"{name}={value!r}")

        return join_str.join(rendered)


class TableSegment(TextSegment):
    """A table segment: a ``TextSegment`` extended with table-specific optional fields.

    Overrides the inherited ``type`` default with ``"table"``.
    """

    footer: Optional[str] = Field(None, description="Footer of the table or figure, to explain variable acronyms.")
    html: Optional[str] = Field(None, description="Content as HTML structured", repr=False)
    image: Optional[str] = Field(None, description="URL/filepath of the element's image", repr=False)
    probability: Optional[float] = Field(None, description="Probability or confidence of the segment's extraction")
    type: Optional[str] = Field("table", description="Type of the element", repr=False)


class FigureSegment(TableSegment):
    """A figure segment: inherits every ``TableSegment`` field and defaults ``type`` to ``"figure"``."""

    type: Optional[str] = Field("figure", description="Type of the element", repr=False)
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ class DataframeSchema(BaseModel):
class DataframeData(BaseModel):
"""Tabular dataframe representation for generalized import support."""

schema: DataframeSchema = Field(..., description="Schema definition with fields and primary key")
schema_: DataframeSchema = Field(..., alias="schema", description="Schema definition with fields and primary key")
data: List[Dict[str, Any]] = Field(..., description="List of data rows as dictionaries")


Expand Down
34 changes: 23 additions & 11 deletions extralit-server/src/extralit_server/cli/__init__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,30 @@
# Copyright 2021-present, the Recognai S.L. team.
# Copyright 2024-present, Extralit Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .__main__ import app
import typer

from .database import app as database_app
from .search_engine import app as search_engine_app
from .start import start
from .worker import worker

# Root Typer application for server management; with no arguments it prints help.
app = typer.Typer(help="Commands for Extralit server management", no_args_is_help=True)

# Command groups provided by the sibling `database` and `search_engine` modules.
app.add_typer(database_app, name="database")
app.add_typer(search_engine_app, name="search-engine")
# Stand-alone commands registered directly on the root application.
app.command(name="worker", help="Starts rq workers")(worker)
app.command(name="start", help="Starts the Extralit server")(start)

# Invoke the CLI when this module is executed as a script.
if __name__ == "__main__":
    app()
16 changes: 1 addition & 15 deletions extralit-server/src/extralit_server/cli/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import typer

from .database import app as database_app
from .search_engine import app as search_engine_app
from .start import start
from .worker import worker

app = typer.Typer(help="Commands for Extralit server management", no_args_is_help=True)


app.add_typer(database_app, name="database")
app.add_typer(search_engine_app, name="search-engine")
app.command(name="worker", help="Starts rq workers")(worker)
app.command(name="start", help="Starts the Extralit server")(start)

from extralit_server.cli import app

if __name__ == "__main__":
app()
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright 2024-present, Extralit Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Loading
Loading