Extralit · JonnyTran · Aug 12, 2025 · Aug 5, 2025 · Aug 5, 2025 · Aug 5, 2025
diff --git a/.gitignore b/.gitignore
@@ -155,3 +155,6 @@ src/**/server/static/
 # App generated files
 extralit-server/src/extralit_server/static
 extralit/site
+
+# Development files
+*.db
@@ -1,7 +1,7 @@
 OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES # Needed by RQ to work with forked processes on MacOS
 ALEMBIC_CONFIG=src/extralit_server/alembic.ini
 EXTRALIT_AUTH_SECRET_KEY=8VO7na5N/jQx+yP/N+HlE8q51vPdrxqlh6OzoebIyko= # With this we avoid using a different key every time the server is reloaded
-EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///${HOME}/.extralit/extralit-dev.db?check_same_thread=False
+EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///./extralit-dev.db?check_same_thread=False
 HF_HUB_DISABLE_TELEMETRY=1
 
 # S3 Configuration (skipped to use LocalFileStorage)
@@ -20,3 +20,13 @@ EXTRALIT_ELASTICSEARCH=http://localhost:9200
 
 # Redis configuration
 EXTRALIT_REDIS_URL=redis://localhost:6379/0
+
+# PDF Preprocessing
+PREPROCESSING_ENABLED=true
+PREPROCESSING_ENABLE_ANALYSIS=true
+PREPROCESSING_ROTATE_PAGES=true
+PREPROCESSING_ROTATE_PAGES_THRESHOLD=2.0
+PREPROCESSING_CLEAN=false
+PREPROCESSING_SKIP_TEXT=true
+PREPROCESSING_TESSERACT_TIMEOUT=0
+PREPROCESSING_QUIET=false
@@ -3,7 +3,7 @@ name = "extralit-server"
 dynamic = ["version"]
 description = "Open-source tool for accurate & fast scientific literature data extraction with LLM and human-in-the-loop."
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 license = { text = "Apache-2.0" }
 keywords = [
     "literature-review",
@@ -49,6 +49,7 @@ dependencies = [
     "social-auth-core ~= 4.5.0",
     # Background processing
     "rq ~= 1.16.2",
+    "lazy-loader>=0.4",
     # Info status
     "psutil ~= 5.8, <5.10",
     # For logging, tracebacks, printing, progressbars
@@ -60,12 +61,16 @@ dependencies = [
     "standardwebhooks>=1.0.0",
     # For HF dataset import
     "datasets >= 3.0.1",
-    "pillow >= 10.4.0",
+    "pillow>=11.3.0",
     # For Telemetry
     "huggingface-hub>=0.26.2",
     "Jinja2>=3.1.4",           # Used by huggingface-hub to render dataset card templates
     # For file storage
     "minio>=7.2.7",
+    # For document processing
+    "ocrmypdf>=16.10.4",
+    "pdf2image>=1.17.0",
+    "opencv-python>=4.11.0.86",
 ]
 
 [project.optional-dependencies]
@@ -182,8 +187,9 @@ worker = { cmd = "python -m extralit_server worker" }
 server-dev.composite = [
     "migrate",
     "cli database users create_default",
-    "server",
+    "server-and-worker",
 ]
+server-and-worker = { shell = "pdm run server & pdm run worker & wait" }
 test = { cmd = "pytest --verbosity=1 --disable-warnings", env_file = ".env.test" }
 test-cov = { cmd = "pytest tests --cov=extralit_server --cov-report=term --cov-report=xml --verbosity=0 --disable-warnings", env_file = ".env.test" }
 

@@ -0,0 +1,29 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Optional
+from pydantic import BaseModel
+
+
+class PDFMetadata(BaseModel):
+    """
+    Metadata for PDF processing results.
+    """
+
+    # Required: name of the processed file.
+    filename: str
+    # Required: how long processing took (unit is not stated here; presumably seconds, to be confirmed).
+    processing_time: float
+    # The remaining fields are optional and default to None when not supplied.
+    page_count: Optional[int] = None
+    language_detected: Optional[List[str]] = None
+    # Free-form dictionaries; their key schema is not defined in this module.
+    processing_settings: Optional[Dict] = None
+    analysis_results: Optional[Dict] = None
@@ -0,0 +1,122 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import uuid
+from typing import Optional, Any, List, Union
+
+from pydantic import BaseModel, Field, validator
+
+"""
+This module is deprecated and outdated; it is kept for reference only and should not be used in new code.
+We may want to switch to LlamaIndexDocument or other document models in the future.
+"""
+
+
+class Segments(BaseModel):
+    # Ordered container of document segments. Each raw item is coerced by
+    # `parse_segments` into a TextSegment, TableSegment or FigureSegment.
+    items: List[Union["TextSegment", "TableSegment", "FigureSegment"]] = Field(
+        default_factory=list,
+        description="List of segments in the reading order of the document",
+    )
+
+    def get(self, id: str, header: str | None = None, default=None):
+        """Return the first segment whose id equals `id` or whose header equals `header`.
+
+        The header comparison is only attempted when a non-empty `header` is given.
+        `default` is returned when no segment matches.
+        """
+        for item in self.items:
+            if item.id == id or (header and item.header == header):
+                return item
+
+        return default
+
+    def __repr_str__(self, join_str: str) -> str:
+        """Render one `ClassName(segment)` entry per line, indented by two spaces."""
+        return "\n  " + f"{join_str}\n  ".join(f"{type(item).__name__}({item})" for item in self.items)
+
+    @validator("items", pre=True, each_item=True)
+    def parse_segments(cls, v):
+        """Build the concrete segment model for one raw item.
+
+        Non-dict items are converted with their `.dict()` method first. The
+        `type` value selects the model: "figure"/"image" gives a FigureSegment,
+        "table" (or the presence of an "html" key) gives a TableSegment, and
+        anything else gives a TextSegment.
+        """
+        if not isinstance(v, dict):
+            v = v.dict()
+
+        # `type` is Optional on every segment model, so the key can be present
+        # with a None value; treat that the same as a missing key instead of
+        # failing on `None.lower()`.
+        segment_type = (v.get("type") or "").lower()
+        if segment_type in {"figure", "image"}:
+            return FigureSegment(**v)
+        elif segment_type == "table" or "html" in v:
+            return TableSegment(**v)
+        else:
+            return TextSegment(**v)
+
+    def __getitem__(self, index):
+        """Index or slice directly into `items`."""
+        return self.items[index]
+
+    def __len__(self):
+        """Return the number of segments."""
+        return len(self.items)
+
+
+class Coordinates(BaseModel):
+    # Position of a segment within a page layout. Only `points` is required;
+    # the other fields default to None.
+    points: List[List[float]] = Field(
+        ..., description="List of 4 points, e.g. [[x1, y1], [x2, y1], [x1, y2], [x2, y2]]"
+    )
+    layout_width: Optional[int] = Field(None, description="Width of the layout")
+    layout_height: Optional[int] = Field(None, description="Height of the layout")
+    # Explicit None default, consistent with the sibling optional fields; without
+    # it an Optional annotation alone leaves the field required under pydantic v2.
+    system: Optional[str] = Field(None, description="System of coordinates")
+
+    def __repr_str__(self, join_str: str) -> str:
+        """Return an empty string, keeping the coordinate values out of the model's repr."""
+        return ""
+
+
+class TextSegment(BaseModel):
+    # A piece of document text plus optional header, location and provenance fields.
+    id: str = Field(
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique identifier of the segment",
+        repr=False,
+    )
+    header: Optional[str] = Field(None, description="Header of the element")
+    text: str = Field(..., description="Content as plain text", repr=False)
+    summary: Optional[str] = Field(None, description="Summary of the content")
+    page_number: Optional[int] = Field(None, description="Page number of the segment")
+    coordinates: Optional["Coordinates"] = Field(
+        None,
+        description="Coordinates of the element in the document",
+        repr=False,
+    )
+    level: Optional[int] = Field(None, description="Level of the header")
+    source: Optional[str] = Field(None, description="Source of the element", repr=False)
+    type: Optional[str] = Field("text", description="Type of the element", repr=False)
+    original: Optional[Any] = Field(
+        None,
+        exclude=True,
+        description="Original object from which the segment was extracted",
+        repr=False,
+    )
+
+    def text_cleaned(self):
+        """Return `text` with " | " replaced by a space, "---" removed, and outer whitespace stripped."""
+        without_pipes = self.text.replace(" | ", " ")
+        without_rules = without_pipes.replace("---", "")
+        return without_rules.strip()
+
+    def __repr_str__(self, join_str: str) -> str:
+        """Join the truthy repr arguments, abbreviating strings longer than 200 characters.
+
+        Long strings are shown as their first and last 100 characters with
+        newlines removed; unnamed arguments are rendered with plain `repr`.
+        """
+        rendered = []
+        for name, value in self.__repr_args__():
+            # Skip empty values and the excluded key.
+            if not value or name in {"INCLUDE_METADATA_KEYS"}:
+                continue
+            if name is None:
+                rendered.append(repr(value))
+            elif isinstance(value, str) and len(value) > 200:
+                shortened = f'{name}="{value[:100]}...{value[-100:]}"'
+                rendered.append(shortened.replace("\n", ""))
+            else:
+                rendered.append(f"{name}={value!r}")
+        return join_str.join(rendered)
+
+
+class TableSegment(TextSegment):
+    # Extends TextSegment with table-oriented optional fields; `type` defaults to "table".
+    footer: Optional[str] = Field(None, description="Footer of the table or figure, to explain variable acronyms.")
+    html: Optional[str] = Field(None, description="Content as HTML structured", repr=False)
+    image: Optional[str] = Field(None, description="URL/filepath of the element's image", repr=False)
+    probability: Optional[float] = Field(None, description="Probability or confidence of the segment's extraction")
+    type: Optional[str] = Field("table", description="Type of the element", repr=False)
+
+
+class FigureSegment(TableSegment):
+    # Same fields as TableSegment; only the default `type` changes to "figure".
+    type: Optional[str] = Field("figure", description="Type of the element", repr=False)
@@ -68,7 +68,7 @@ class DataframeSchema(BaseModel):
 class DataframeData(BaseModel):
     """Tabular dataframe representation for generalized import support."""
 
-    schema: DataframeSchema = Field(..., description="Schema definition with fields and primary key")
+    schema_: DataframeSchema = Field(..., alias="schema", description="Schema definition with fields and primary key")
     data: List[Dict[str, Any]] = Field(..., description="List of data rows as dictionaries")
 
 

@@ -1,18 +1,30 @@
-#  Copyright 2021-present, the Recognai S.L. team.
+# Copyright 2024-present, Extralit Labs, Inc.
 #
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#      http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
-from .__main__ import app
+import typer
+
+from .database import app as database_app
+from .search_engine import app as search_engine_app
+from .start import start
+from .worker import worker
+
+# Root Typer application; `no_args_is_help=True` prints the help text when no command is given.
+app = typer.Typer(help="Commands for Extralit server management", no_args_is_help=True)
+
+# Mount the sub-applications as the "database" and "search-engine" command groups.
+app.add_typer(database_app, name="database")
+app.add_typer(search_engine_app, name="search-engine")
+# Register the imported `worker` and `start` callables as top-level commands.
+app.command(name="worker", help="Starts rq workers")(worker)
+app.command(name="start", help="Starts the Extralit server")(start)
 
 if __name__ == "__main__":
     app()
@@ -12,21 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import typer
-
-from .database import app as database_app
-from .search_engine import app as search_engine_app
-from .start import start
-from .worker import worker
-
-app = typer.Typer(help="Commands for Extralit server management", no_args_is_help=True)
-
-
-app.add_typer(database_app, name="database")
-app.add_typer(search_engine_app, name="search-engine")
-app.command(name="worker", help="Starts rq workers")(worker)
-app.command(name="start", help="Starts the Extralit server")(start)
-
+from extralit_server.cli import app
 
 if __name__ == "__main__":
     app()
@@ -0,0 +1,14 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+