Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions 3rdparty/libs/fileext/doc/doc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,18 @@ int Doc::convert(bool addStyle, bool extractImages, char mergingMode) {

// Separate paragraphs and add them to HTML tags
for (const auto& line : tools::explode(text, "\n\r")) {
if (shouldStopProcessing()) {
break;
}

if (line.empty()) {
m_text += "\u00A0";
if (!safeAppendText("\u00A0")) {
break;
}
} else {
m_text += line + '\n';
if (!safeAppendText(line + '\n')) {
break;
}
}
}
return 0;
Expand Down
5 changes: 4 additions & 1 deletion 3rdparty/libs/fileext/docx/docx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,10 @@ void Docx::getParagraphText(const pugi::xml_node& xmlNode) {
}
}

m_text += text + '\n';
if (!safeAppendText(text + '\n')) {
// Truncation occurred, stop processing
return;
}
}

std::string Docx::getElementText(const pugi::xml_node& xmlNode) {
Expand Down
6 changes: 6 additions & 0 deletions 3rdparty/libs/fileext/excel/excel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,12 @@ int Excel::convert(bool addStyle, bool extractImages, char mergingMode) {
book->openWorkbookXls();
}

// Apply truncation if enabled
if (m_truncationEnabled && m_text.size() > m_maxBytes) {
m_text = truncateAtBoundary(m_text, m_maxBytes);
m_truncated = true;
}

delete book;
return 0;
}
Expand Down
93 changes: 93 additions & 0 deletions 3rdparty/libs/fileext/fileext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
* @package fileext
* @file fileext.cpp
* @author dmryutov (dmryutov@gmail.com)
* @version 1.1.1
* @date 12.07.2016 -- 10.02.2018
*/
#include <fstream>
Expand All @@ -11,6 +12,7 @@

#include "fileext.hpp"
#include <iostream>
#include <algorithm>

namespace fileext {

Expand All @@ -22,4 +24,95 @@ const std::string SCRIPT_FILE = LIB_PATH + "/xpathconfig.min.js";
FileExtension::FileExtension(const std::string& fileName)
: m_fileName(fileName) {}

/**
 * @brief Enable truncation and set the byte budget for extracted text
 * @details Truncation is switched on for every value of maxBytes, including 0,
 *          and the truncation status flag is reset.
 * @param[in] maxBytes Maximum number of bytes allowed in m_text
 */
void FileExtension::setTruncationLimit(size_t maxBytes)
{
    // Reset the status flag before installing the new limit
    m_truncated = false;
    // Truncation is enabled unconditionally, even for a zero budget
    m_truncationEnabled = true;
    m_maxBytes = maxBytes;
}

/**
 * @brief Append text to m_text while honoring the truncation limit
 * @details When truncation is disabled the text is appended unconditionally.
 *          Otherwise at most (m_maxBytes - m_text.size()) bytes are appended,
 *          cut via truncateAtBoundary(). Once truncation has happened every
 *          later call is rejected, so the output is always a prefix of the
 *          extracted text.
 * @param[in] text Text to append
 * @return true if the whole text was appended; false if it was dropped or only
 *         partially appended (m_truncated is set whenever the limit was hit)
 */
bool FileExtension::safeAppendText(const std::string& text)
{
    if (!m_truncationEnabled) {
        m_text += text;
        return true;
    }

    // truncateAtBoundary() may cut before the byte limit, leaving slack in
    // m_text. Callers that merely return from the current record/paragraph
    // would otherwise be able to squeeze a later, shorter chunk into that
    // slack and produce text with a hole in it.
    if (m_truncated) {
        return false;
    }

    const size_t currentSize = m_text.size();

    // Already at or over the limit (this also covers m_maxBytes == 0)
    if (currentSize >= m_maxBytes) {
        m_truncated = true;
        return false;
    }

    // Compare against the remaining room instead of summing the sizes, so the
    // check cannot wrap around size_t
    const size_t remainingBytes = m_maxBytes - currentSize;
    if (text.size() <= remainingBytes) {
        m_text += text;
        return true;
    }

    // Only part of the text fits: keep a boundary-aligned prefix
    m_text += truncateAtBoundary(text, remainingBytes);
    m_truncated = true;

    return false;  // Indicate truncation occurred
}

bool FileExtension::shouldStopProcessing() const
{
return m_truncationEnabled && m_text.size() >= m_maxBytes;
}

/**
 * @brief Truncate text to at most maxLength bytes at a reasonable boundary
 * @details Preference order for the cut position:
 *          1. just after a sentence terminator ('.', '!', '?', '\n') found in
 *             the last 50 bytes before the limit;
 *          2. just before a space or tab found in the last 20 bytes;
 *          3. the byte limit itself, moved back so that a multi-byte UTF-8
 *             sequence is never split.
 * @param[in] text Text to truncate
 * @param[in] maxLength Maximum length allowed, in bytes
 * @return text unchanged if it already fits, otherwise a prefix of text that
 *         is no longer than maxLength bytes
 */
std::string FileExtension::truncateAtBoundary(const std::string& text, size_t maxLength) const
{
    if (text.size() <= maxLength) {
        return text;
    }

    if (maxLength == 0) {
        return "";
    }

    // Look for sentence boundaries (. ! ? newline) within the last 50 bytes.
    // These are ASCII bytes, so cutting right after one never splits a
    // multi-byte UTF-8 sequence.
    size_t searchStart = (maxLength > 50) ? maxLength - 50 : 0;
    for (size_t i = maxLength - 1; i > searchStart; --i) {
        const char c = text[i];
        if (c == '.' || c == '!' || c == '?' || c == '\n') {
            return text.substr(0, i + 1);
        }
    }

    // Look for word boundaries within the last 20 bytes. A newline cannot
    // occur here: the wider sentence window above would already have matched.
    searchStart = (maxLength > 20) ? maxLength - 20 : 0;
    for (size_t i = maxLength - 1; i > searchStart; --i) {
        const char c = text[i];
        if (c == ' ' || c == '\t') {
            return text.substr(0, i);
        }
    }

    // No good boundary found: cut at the limit, but not in the middle of a
    // UTF-8 sequence. text[maxLength] exists because text.size() > maxLength.
    const auto isContinuation = [](char ch) {
        return (static_cast<unsigned char>(ch) & 0xC0) == 0x80;
    };
    size_t cut = maxLength;
    // A UTF-8 sequence has at most 3 continuation bytes
    for (int steps = 0; steps < 3 && cut > 0 && isContinuation(text[cut]); ++steps) {
        --cut;
    }
    if (isContinuation(text[cut])) {
        // Not valid UTF-8 around the limit; fall back to the plain byte cut
        cut = maxLength;
    }
    return text.substr(0, cut);
}

/**
 * @brief Public entry point for boundary-aware truncation of arbitrary content
 * @param[in] content Content to truncate
 * @param[in] maxLength Maximum length allowed
 * @return Result of truncateAtBoundary() for the given content and limit
 */
std::string FileExtension::applyFinalTruncation(const std::string& content, size_t maxLength)
{
    std::string shortened = truncateAtBoundary(content, maxLength);
    return shortened;
}

} // End namespace
58 changes: 58 additions & 0 deletions 3rdparty/libs/fileext/fileext.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,35 @@ class FileExtension {

std::string m_text = "";

/**
 * @brief Set truncation limit for content processing
 * @details Calling this always enables truncation and clears the truncated
 *          flag. A value of 0 does NOT mean "no limit": the implementation
 *          enables truncation with a zero-byte budget, so no text is kept.
 *          To process without a limit, do not call this method.
 * @param[in] maxBytes Maximum number of bytes of text to keep
 * @since 1.1.2
 */
void setTruncationLimit(size_t maxBytes);

/**
* @brief Check if content was truncated during processing
* @return true if content was truncated, false otherwise
* @since 1.1.2
*/
bool isTruncated() const { return m_truncated; }

/**
 * @brief Apply final truncation to content (public interface)
 * @details Forwards to truncateAtBoundary(). It operates only on the passed
 *          content and does not modify m_text or the truncated flag; use
 *          markAsTruncated() to record an external truncation.
 * @param[in] content Content to truncate
 * @param[in] maxLength Maximum length allowed
 * @return Truncated content
 * @since 1.1.2
 */
std::string applyFinalTruncation(const std::string& content, size_t maxLength);

/**
* @brief Mark content as truncated (for external truncation)
* @since 1.1.2
*/
void markAsTruncated() { m_truncated = true; }

protected:
// int m_maxLen = 0;
/** Name of processing file */
Expand All @@ -77,6 +106,35 @@ class FileExtension {
bool m_extractImages = false;
/** List of images (binary data and extension) */
std::vector<std::pair<std::string, std::string>> m_imageList;

/** Truncation control members */
size_t m_maxBytes = 0; // Byte budget for m_text; only consulted while m_truncationEnabled is true (0 then keeps no text)
bool m_truncationEnabled = false; // Truncation switch; turned on by setTruncationLimit()
bool m_truncated = false; // Truncation status flag; set once content has been cut or dropped

/**
 * @brief Safely append text with truncation control
 * @details With truncation disabled the text is appended to m_text as is.
 *          With truncation enabled only as much text as still fits below the
 *          byte limit is appended, and the truncated flag is set when the
 *          text did not fit completely.
 * @param[in] text Text to append
 * @return true if the whole text was appended, false if it was dropped or
 *         only partially appended because of the limit
 * @since 1.1.2
 */
bool safeAppendText(const std::string& text);

/**
* @brief Check if processing should stop due to truncation
* @return true if processing should stop
* @since 1.1.2
*/
bool shouldStopProcessing() const;

/**
* @brief Truncate text at reasonable boundary (sentence, word, etc.)
* @param[in] text Text to truncate
* @param[in] maxLength Maximum length allowed
* @return Truncated text
* @since 1.1.2
*/
std::string truncateAtBoundary(const std::string& text, size_t maxLength) const;
};

} // End namespace
12 changes: 11 additions & 1 deletion 3rdparty/libs/fileext/pdf/pdf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,24 @@ int Pdf::convert(bool addStyle, bool extractImages, char mergingMode) {

int numPage = doc->pages();
for (int i = 0; i < numPage; ++i) {
// Check if we should stop processing due to truncation
if (shouldStopProcessing()) {
break;
}

poppler::page *page = doc->create_page(i);
if (page) {
const auto &text = page->text();
if (!text.empty()) {
const auto strutf8 = text.to_utf8();
std::string str;
str.assign(strutf8.begin(), strutf8.end());
m_text += str;

if (!safeAppendText(str)) {
// Truncation occurred, stop processing
delete page;
break;
}
}
delete page;
}
Expand Down
29 changes: 21 additions & 8 deletions 3rdparty/libs/fileext/ppt/ppt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,17 +136,23 @@ void Ppt::parseRecord(const std::string &ppd, size_t &offset, int recType, ulong
auto u = readByte<unsigned short>(ppd, offset, 2);
offset += 2;
if (u == 0x0D || u == 0x0B) {
m_text += '\n';
if (!safeAppendText("\n")) {
return;
}
} else {
if (utf16_unichar_has_4_bytes(u) && ++i < textLen) {
auto b = readByte<unsigned short>(ppd, offset, 2);
offset += 2;
u = (u << 16 | b);
}
m_text += unichar_to_utf8(u);
if (!safeAppendText(unichar_to_utf8(u))) {
return;
}
}
}
m_text += '\n';
if (!safeAppendText("\n")) {
return;
}
break;
}
case RT_TEXT_BYTES_ATOM: {
Expand All @@ -157,12 +163,19 @@ void Ppt::parseRecord(const std::string &ppd, size_t &offset, int recType, ulong
for (int i = 0; i < textLen; ++i) {
auto u = readByte<unsigned short>(ppd, offset, 1);
++offset;
if (u == 0x0B || u == 0x0D)
m_text += '\n';
else
m_text += unichar_to_utf8(u);
if (u == 0x0B || u == 0x0D) {
if (!safeAppendText("\n")) {
return;
}
} else {
if (!safeAppendText(unichar_to_utf8(u))) {
return;
}
}
}
if (!safeAppendText("\n")) {
return;
}
m_text += '\n';
break;
}
case OFFICE_ART_SP_CONTAINER:
Expand Down
5 changes: 4 additions & 1 deletion 3rdparty/libs/fileext/pptx/pptx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ int Pptx::convert(bool addStyle, bool extractImages, char mergingMode) {
Ooxml::extractFile(m_fileName, xmlName, tree);
TreeWalker walker;
tree.traverse(walker);
m_text += walker.content;
if (!safeAppendText(walker.content)) {
// Truncation occurred, stop processing
break;
}
}

return 0;
Expand Down
11 changes: 8 additions & 3 deletions 3rdparty/libs/fileext/txt/txt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,15 @@ int Txt::convert(bool addStyle, bool extractImages, char mergingMode)
{
std::string line;
std::ifstream inputFile(m_fileName);
while (getline(inputFile, line))
m_text += line + '\n';

while (getline(inputFile, line)) {
if (!safeAppendText(line + '\n')) {
// Truncation occurred, stop processing
break;
}
}

inputFile.close();

return 0;
}

Expand Down
6 changes: 6 additions & 0 deletions 3rdparty/libs/fileext/xlsb/xlsb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,12 @@ int Xlsb::convert(bool addStyle, bool extractImages, char mergingMode)
if (!parseWorkSheets(m_text))
return -1;

// Apply truncation if enabled
if (m_truncationEnabled && m_text.size() > m_maxBytes) {
m_text = truncateAtBoundary(m_text, m_maxBytes);
m_truncated = true;
}

return 0;
}

Expand Down
6 changes: 6 additions & 0 deletions debian/changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
docparser (1.0.20) unstable; urgency=medium

* implement truncation functionality in document parser

-- Zhang Sheng <zhangsheng@uniontech.com> Tue, 24 Jun 2025 14:14:45 +0800

docparser (1.0.19) unstable; urgency=medium

* integrate libmagic for MIME type detection in docparser
Expand Down
Loading
Loading