diff --git a/3rdparty/libs/fileext/doc/doc.cpp b/3rdparty/libs/fileext/doc/doc.cpp index 223de09..703c0b1 100644 --- a/3rdparty/libs/fileext/doc/doc.cpp +++ b/3rdparty/libs/fileext/doc/doc.cpp @@ -137,10 +137,18 @@ int Doc::convert(bool addStyle, bool extractImages, char mergingMode) { // Separate pargraphs and add them to HTML tags for (const auto& line : tools::explode(text, "\n\r")) { + if (shouldStopProcessing()) { + break; + } + if (line.empty()) { - m_text += "\u00A0"; + if (!safeAppendText("\u00A0")) { + break; + } } else { - m_text += line + '\n'; + if (!safeAppendText(line + '\n')) { + break; + } } } return 0; diff --git a/3rdparty/libs/fileext/docx/docx.cpp b/3rdparty/libs/fileext/docx/docx.cpp index 8b5ea63..5ef3eab 100644 --- a/3rdparty/libs/fileext/docx/docx.cpp +++ b/3rdparty/libs/fileext/docx/docx.cpp @@ -293,7 +293,10 @@ void Docx::getParagraphText(const pugi::xml_node& xmlNode) { } } - m_text += text + '\n'; + if (!safeAppendText(text + '\n')) { + // Truncation occurred, stop processing + return; + } } std::string Docx::getElementText(const pugi::xml_node& xmlNode) { diff --git a/3rdparty/libs/fileext/excel/excel.cpp b/3rdparty/libs/fileext/excel/excel.cpp index 7162a80..c2b3fd5 100644 --- a/3rdparty/libs/fileext/excel/excel.cpp +++ b/3rdparty/libs/fileext/excel/excel.cpp @@ -65,6 +65,12 @@ int Excel::convert(bool addStyle, bool extractImages, char mergingMode) { book->openWorkbookXls(); } + // Apply truncation if enabled + if (m_truncationEnabled && m_text.size() > m_maxBytes) { + m_text = truncateAtBoundary(m_text, m_maxBytes); + m_truncated = true; + } + delete book; return 0; } diff --git a/3rdparty/libs/fileext/fileext.cpp b/3rdparty/libs/fileext/fileext.cpp index 201fdb6..f92d65d 100644 --- a/3rdparty/libs/fileext/fileext.cpp +++ b/3rdparty/libs/fileext/fileext.cpp @@ -3,6 +3,7 @@ * @package fileext * @file fileext.cpp * @author dmryutov (dmryutov@gmail.com) + * @version 1.1.1 * @date 12.07.2016 -- 10.02.2018 */ #include @@ -11,6 +12,7 @@ 
#include "fileext.hpp" #include +#include namespace fileext { @@ -22,4 +24,95 @@ const std::string SCRIPT_FILE = LIB_PATH + "/xpathconfig.min.js"; FileExtension::FileExtension(const std::string& fileName) : m_fileName(fileName) {} +void FileExtension::setTruncationLimit(size_t maxBytes) +{ + m_maxBytes = maxBytes; + m_truncationEnabled = true; // Always enable when limit is set, even if 0 + m_truncated = false; +} + +bool FileExtension::safeAppendText(const std::string& text) +{ + if (!m_truncationEnabled) { + m_text += text; + return true; + } + + // Special case: if maxBytes is 0, don't add anything + if (m_maxBytes == 0) { + m_truncated = true; + return false; + } + + // Check if adding this text would exceed the limit + size_t currentSize = m_text.size(); + size_t newTextSize = text.size(); + + if (currentSize >= m_maxBytes) { + // Already at or over limit, don't add anything + m_truncated = true; + return false; + } + + if (currentSize + newTextSize <= m_maxBytes) { + // Safe to add all text + m_text += text; + return true; + } + + // Need to truncate the new text + size_t remainingBytes = m_maxBytes - currentSize; + std::string truncatedText = truncateAtBoundary(text, remainingBytes); + + m_text += truncatedText; + m_truncated = true; + + return false; // Indicate truncation occurred +} + +bool FileExtension::shouldStopProcessing() const +{ + return m_truncationEnabled && m_text.size() >= m_maxBytes; +} + +std::string FileExtension::truncateAtBoundary(const std::string& text, size_t maxLength) const +{ + if (text.size() <= maxLength) { + return text; + } + + if (maxLength == 0) { + return ""; + } + + // Try to find a good boundary to truncate at + std::string truncated = text.substr(0, maxLength); + + // Look for sentence boundaries (. ! ?) within the last 50 characters + size_t searchStart = (maxLength > 50) ? maxLength - 50 : 0; + for (size_t i = maxLength - 1; i > searchStart; --i) { + char c = text[i]; + if (c == '.' || c == '!' || c == '?' 
|| c == '\n') { + return text.substr(0, i + 1); + } + } + + // Look for word boundaries (spaces) within the last 20 characters + searchStart = (maxLength > 20) ? maxLength - 20 : 0; + for (size_t i = maxLength - 1; i > searchStart; --i) { + char c = text[i]; + if (c == ' ' || c == '\t' || c == '\n') { + return text.substr(0, i); + } + } + + // If no good boundary found, just truncate at the limit + return truncated; +} + +std::string FileExtension::applyFinalTruncation(const std::string& content, size_t maxLength) +{ + return truncateAtBoundary(content, maxLength); +} + } // End namespace diff --git a/3rdparty/libs/fileext/fileext.hpp b/3rdparty/libs/fileext/fileext.hpp index aee480d..331b0bd 100644 --- a/3rdparty/libs/fileext/fileext.hpp +++ b/3rdparty/libs/fileext/fileext.hpp @@ -58,6 +58,35 @@ class FileExtension { std::string m_text = ""; + /** + * @brief Set truncation limit for content processing + * @param[in] maxBytes Maximum bytes to keep (0 = keep nothing; truncation is enabled for any value) + * @since 1.1.2 + */ + void setTruncationLimit(size_t maxBytes); + + /** + * @brief Check if content was truncated during processing + * @return true if content was truncated, false otherwise + * @since 1.1.2 + */ + bool isTruncated() const { return m_truncated; } + + /** + * @brief Apply final truncation to content (public interface) + * @param[in] content Content to truncate + * @param[in] maxLength Maximum length allowed + * @return Truncated content + * @since 1.1.2 + */ + std::string applyFinalTruncation(const std::string& content, size_t maxLength); + + /** + * @brief Mark content as truncated (for external truncation) + * @since 1.1.2 + */ + void markAsTruncated() { m_truncated = true; } + protected: // int m_maxLen = 0; /** Name of processing file */ @@ -77,6 +106,35 @@ class FileExtension { bool m_extractImages = false; /** List of images (binary data and extension) */ std::vector> m_imageList; + + /** Truncation control members */ + size_t m_maxBytes = 0; // Byte limit; only honored when m_truncationEnabled is true + bool 
m_truncationEnabled = false; // Truncation switch + bool m_truncated = false; // Truncation status flag + + /** + * @brief Safely append text with truncation control + * @param[in] text Text to append + * @return true if text was appended, false if truncation occurred + * @since 1.1.2 + */ + bool safeAppendText(const std::string& text); + + /** + * @brief Check if processing should stop due to truncation + * @return true if processing should stop + * @since 1.1.2 + */ + bool shouldStopProcessing() const; + + /** + * @brief Truncate text at reasonable boundary (sentence, word, etc.) + * @param[in] text Text to truncate + * @param[in] maxLength Maximum length allowed + * @return Truncated text + * @since 1.1.2 + */ + std::string truncateAtBoundary(const std::string& text, size_t maxLength) const; }; } // End namespace diff --git a/3rdparty/libs/fileext/pdf/pdf.cpp b/3rdparty/libs/fileext/pdf/pdf.cpp index f463345..f547fef 100644 --- a/3rdparty/libs/fileext/pdf/pdf.cpp +++ b/3rdparty/libs/fileext/pdf/pdf.cpp @@ -28,6 +28,11 @@ int Pdf::convert(bool addStyle, bool extractImages, char mergingMode) { int numPage = doc->pages(); for (int i = 0; i < numPage; ++i) { + // Check if we should stop processing due to truncation + if (shouldStopProcessing()) { + break; + } + poppler::page *page = doc->create_page(i); if (page) { const auto &text = page->text(); @@ -35,7 +40,12 @@ int Pdf::convert(bool addStyle, bool extractImages, char mergingMode) { const auto strutf8 = text.to_utf8(); std::string str; str.assign(strutf8.begin(), strutf8.end()); - m_text += str; + + if (!safeAppendText(str)) { + // Truncation occurred, stop processing + delete page; + break; + } } delete page; } diff --git a/3rdparty/libs/fileext/ppt/ppt.cpp b/3rdparty/libs/fileext/ppt/ppt.cpp index f66bf34..975ebeb 100644 --- a/3rdparty/libs/fileext/ppt/ppt.cpp +++ b/3rdparty/libs/fileext/ppt/ppt.cpp @@ -136,17 +136,23 @@ void Ppt::parseRecord(const std::string &ppd, size_t &offset, int recType, ulong auto u = 
readByte(ppd, offset, 2); offset += 2; if (u == 0x0D || u == 0x0B) { - m_text += '\n'; + if (!safeAppendText("\n")) { + return; + } } else { if (utf16_unichar_has_4_bytes(u) && ++i < textLen) { auto b = readByte(ppd, offset, 2); offset += 2; u = (u << 16 | b); } - m_text += unichar_to_utf8(u); + if (!safeAppendText(unichar_to_utf8(u))) { + return; + } } } - m_text += '\n'; + if (!safeAppendText("\n")) { + return; + } break; } case RT_TEXT_BYTES_ATOM: { @@ -157,12 +163,19 @@ void Ppt::parseRecord(const std::string &ppd, size_t &offset, int recType, ulong for (int i = 0; i < textLen; ++i) { auto u = readByte(ppd, offset, 1); ++offset; - if (u == 0x0B || u == 0x0D) - m_text += '\n'; - else - m_text += unichar_to_utf8(u); + if (u == 0x0B || u == 0x0D) { + if (!safeAppendText("\n")) { + return; + } + } else { + if (!safeAppendText(unichar_to_utf8(u))) { + return; + } + } + } + if (!safeAppendText("\n")) { + return; } - m_text += '\n'; break; } case OFFICE_ART_SP_CONTAINER: diff --git a/3rdparty/libs/fileext/pptx/pptx.cpp b/3rdparty/libs/fileext/pptx/pptx.cpp index 8a456db..310685d 100644 --- a/3rdparty/libs/fileext/pptx/pptx.cpp +++ b/3rdparty/libs/fileext/pptx/pptx.cpp @@ -49,7 +49,10 @@ int Pptx::convert(bool addStyle, bool extractImages, char mergingMode) { Ooxml::extractFile(m_fileName, xmlName, tree); TreeWalker walker; tree.traverse(walker); - m_text += walker.content; + if (!safeAppendText(walker.content)) { + // Truncation occurred, stop processing + break; + } } return 0; diff --git a/3rdparty/libs/fileext/txt/txt.cpp b/3rdparty/libs/fileext/txt/txt.cpp index f53ab7e..37c75d4 100644 --- a/3rdparty/libs/fileext/txt/txt.cpp +++ b/3rdparty/libs/fileext/txt/txt.cpp @@ -19,10 +19,15 @@ int Txt::convert(bool addStyle, bool extractImages, char mergingMode) { std::string line; std::ifstream inputFile(m_fileName); - while (getline(inputFile, line)) - m_text += line + '\n'; + + while (getline(inputFile, line)) { + if (!safeAppendText(line + '\n')) { + // Truncation 
occurred, stop processing + break; + } + } + inputFile.close(); - return 0; } diff --git a/3rdparty/libs/fileext/xlsb/xlsb.cpp b/3rdparty/libs/fileext/xlsb/xlsb.cpp index 84b8341..c9fd3a0 100644 --- a/3rdparty/libs/fileext/xlsb/xlsb.cpp +++ b/3rdparty/libs/fileext/xlsb/xlsb.cpp @@ -77,6 +77,12 @@ int Xlsb::convert(bool addStyle, bool extractImages, char mergingMode) if (!parseWorkSheets(m_text)) return -1; + // Apply truncation if enabled + if (m_truncationEnabled && m_text.size() > m_maxBytes) { + m_text = truncateAtBoundary(m_text, m_maxBytes); + m_truncated = true; + } + return 0; } diff --git a/debian/changelog b/debian/changelog index 559230a..6091028 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +docparser (1.0.20) unstable; urgency=medium + + * implement truncation functionality in document parser + + -- Zhang Sheng Tue, 24 Jun 2025 14:14:45 +0800 + docparser (1.0.19) unstable; urgency=medium * integrate libmagic for MIME type detection in docparser diff --git a/src/docparser.cpp b/src/docparser.cpp index 795d464..8a68063 100644 --- a/src/docparser.cpp +++ b/src/docparser.cpp @@ -24,6 +24,7 @@ #include #include #include +#include static bool isTextSuffix(std::string_view suffix) { @@ -40,6 +41,24 @@ static bool isTextSuffix(std::string_view suffix) return validSuffixes.count(lowercaseSuffix) > 0; } +/** + * @brief Extract and normalize file extension + * @param filename Path to the file + * @return Lowercase file extension, or empty string if no extension + */ +static std::string extractFileExtension(const std::string &filename) +{ + size_t dotPos = filename.find_last_of('.'); + if (dotPos == std::string::npos || dotPos == filename.length() - 1) { + return {}; + } + + std::string suffix = filename.substr(dotPos + 1); + std::transform(suffix.begin(), suffix.end(), suffix.begin(), + [](unsigned char c) { return std::tolower(c); }); + return suffix; +} + /** * @brief Check if a file is a text file using libmagic MIME type detection * 
@param filename The path to the file to check @@ -99,6 +118,33 @@ static bool isTextFileByMimeType(const std::string &filename) return isText; } +/** + * @brief Get file size in bytes + * @param filename Path to the file + * @return File size in bytes, or 0 if file doesn't exist or error occurred + */ +static size_t getFileSize(const std::string &filename) +{ + struct stat stat_buf; + if (stat(filename.c_str(), &stat_buf) == 0) { + return static_cast(stat_buf.st_size); + } + return 0; +} + +/** + * @brief Check if a file is small enough to use the original processing path + * @param filename Path to the file + * @param maxBytes Maximum bytes limit + * @return true if file is small enough that doesn't need truncation + */ +static bool isSmallFile(const std::string &filename, size_t maxBytes) +{ + // Check file size first (early exit if too large) + size_t fileSize = getFileSize(filename); + return fileSize > 0 && fileSize <= maxBytes; +} + // 预处理后缀映射,避免多次strcasecmp比较 using FileCreator = std::unique_ptr (*)(const std::string &, const std::string &); @@ -196,41 +242,52 @@ static const std::unordered_map createSimilarExtension }; } -static std::string doConvertFile(const std::string &filename, std::string suffix) +/** + * @brief Create parser instance for the given file + * @param filename Path to the file + * @param suffix File extension (lowercase) + * @return Unique pointer to FileExtension instance, or nullptr if unsupported + */ +static std::unique_ptr createParser(const std::string &filename, const std::string &suffix) { static const std::unordered_map extensionMap = createExtensionMap(); - static const std::unordered_map similarExtensionMap = createSimilarExtensionMap(); + // First check if it is a text file + if (isTextSuffix(suffix)) { + return createTxt(filename, suffix); + } + + // Find the corresponding creation function + auto it = extensionMap.find(suffix); + if (it != extensionMap.end()) { + return it->second(filename, suffix); + } + + // Extension not 
found in map, check if it's a text file by content + std::cout << "INFO: [createParser] Unknown file extension '" << suffix + << "', checking file content for text type: " << filename << std::endl; + + if (isTextFileByMimeType(filename)) { + std::cout << "INFO: [createParser] File detected as text by MIME type analysis: " + << filename << std::endl; + return createTxt(filename, suffix); + } + + return nullptr; +} + +static std::string doConvertFile(const std::string &filename, std::string suffix) +{ // Convert suffix to lowercase std::transform(suffix.begin(), suffix.end(), suffix.begin(), [](unsigned char c) { return std::tolower(c); }); - std::unique_ptr document; + std::unique_ptr document = createParser(filename, suffix); + if (!document) { + throw std::logic_error("Unsupported file extension: " + filename); + } try { - // First check if it is a text file - if (isTextSuffix(suffix)) { - document = createTxt(filename, suffix); - } else { - // Find the corresponding creation function - auto it = extensionMap.find(suffix); - if (it != extensionMap.end()) { - document = it->second(filename, suffix); - } else { - // Extension not found in map, check if it's a text file by content - std::cout << "INFO: [doConvertFile] Unknown file extension '" << suffix - << "', checking file content for text type: " << filename << std::endl; - - if (isTextFileByMimeType(filename)) { - std::cout << "INFO: [doConvertFile] File detected as text by MIME type analysis: " - << filename << std::endl; - document = createTxt(filename, suffix); - } else { - throw std::logic_error("Unsupported file extension: " + filename); - } - } - } - document->convert(); // Use move semantics to avoid copying return std::move(document->m_text); @@ -243,16 +300,70 @@ static std::string doConvertFile(const std::string &filename, std::string suffix return {}; } -std::string DocParser::convertFile(const std::string &filename) +/** + * @brief Convert file with truncation support + * @param filename Path to the 
file + * @param maxBytes Maximum bytes to process + * @return Converted text content (potentially truncated) + */ +static std::string doConvertFileWithTruncation(const std::string &filename, size_t maxBytes) { - // 更高效地查找最后一个点的位置 - size_t dotPos = filename.find_last_of('.'); - if (dotPos == std::string::npos || dotPos == filename.length() - 1) + std::string suffix = extractFileExtension(filename); + if (suffix.empty()) { return {}; + } - std::string suffix = filename.substr(dotPos + 1); - std::transform(suffix.begin(), suffix.end(), suffix.begin(), - [](unsigned char c) { return std::tolower(c); }); + std::unique_ptr document = createParser(filename, suffix); + if (!document) { + // Try similar extensions + static const std::unordered_map similarExtensionMap = createSimilarExtensionMap(); + auto it = similarExtensionMap.find(suffix); + if (it != similarExtensionMap.end()) { + document = createParser(filename, it->second); + } + } + + if (!document) { + std::cerr << "ERROR: [doConvertFileWithTruncation] Unsupported file extension: " << filename << std::endl; + return {}; + } + + try { + // Set truncation limit + document->setTruncationLimit(maxBytes); + + // Convert with truncation control + document->convert(); + + // Get result and add truncation marker if needed + std::string result = std::move(document->m_text); + + // Fallback truncation: if the result still exceeds maxBytes, do final truncation + if (result.size() > maxBytes) { + result = document->applyFinalTruncation(result, maxBytes); + document->markAsTruncated(); + } + + if (document->isTruncated()) { + result += "\n[CONTENT_TRUNCATED]"; + } + + return result; + } catch (const std::logic_error &error) { + std::cout << error.what() << std::endl; + } catch (...) 
{ + std::cerr << "Parse failed: " << filename << std::endl; + } + + return {}; +} + +std::string DocParser::convertFile(const std::string &filename) +{ + std::string suffix = extractFileExtension(filename); + if (suffix.empty()) { + return {}; + } // 尝试使用原始后缀解析 std::string content = doConvertFile(filename, suffix); @@ -260,7 +371,6 @@ std::string DocParser::convertFile(const std::string &filename) return content; // 尝试相似后缀 - static const std::unordered_map extensionMap = createExtensionMap(); static const std::unordered_map similarExtensionMap = createSimilarExtensionMap(); auto it = similarExtensionMap.find(suffix); @@ -270,3 +380,14 @@ std::string DocParser::convertFile(const std::string &filename) return {}; } + +std::string DocParser::convertFile(const std::string &filename, size_t maxBytes) +{ + // Quick check for small files - use original path for maximum compatibility + if (isSmallFile(filename, maxBytes)) { + return convertFile(filename); + } + + // Use truncation processing for all other cases + return doConvertFileWithTruncation(filename, maxBytes); +} diff --git a/src/docparser.h b/src/docparser.h index f56e482..eb27392 100644 --- a/src/docparser.h +++ b/src/docparser.h @@ -11,6 +11,7 @@ class DocParser { public: static std::string convertFile(const std::string &filename); + static std::string convertFile(const std::string &filename, size_t maxBytes); }; #endif // DOCPARSER_H diff --git a/tests/autotest.cpp b/tests/autotest.cpp index d9cb7e5..1058784 100644 --- a/tests/autotest.cpp +++ b/tests/autotest.cpp @@ -56,6 +56,13 @@ private slots: // MIME type detection tests void testMimeTypeDetection(); + // Truncation functionality tests + void testTruncationBasicFunctionality(); + void testTruncationWithSmallFiles(); + void testTruncationWithLargeFiles(); + void testTruncationBoundaryConditions(); + void testTruncationBackwardCompatibility(); + private: QString createTestFile(const QString &content, const QString &suffix = "txt"); QString 
createBinaryTestFile(const QByteArray &data, const QString &suffix); @@ -414,6 +421,149 @@ void DocParserAutoTest::testMimeTypeDetection() qInfo() << "INFO: [DocParserAutoTest::testMimeTypeDetection] MIME detection result length:" << result.length(); } +void DocParserAutoTest::testTruncationBasicFunctionality() +{ + qInfo() << "INFO: [DocParserAutoTest::testTruncationBasicFunctionality] Testing basic truncation functionality"; + + QString content = "This is a test file with some content that should be truncated at a specific point."; + QString testFile = createTestFile(content, "txt"); + QVERIFY(!testFile.isEmpty()); + + // Test truncation at 30 bytes + size_t maxBytes = 30; + std::string result = DocParser::convertFile(testFile.toStdString(), maxBytes); + + QVERIFY2(!result.empty(), "Truncated result should not be empty"); + QVERIFY2(result.length() <= maxBytes + 20, "Result should be approximately within truncation limit"); // Margin covers the appended "\n[CONTENT_TRUNCATED]" marker (20 bytes) + QVERIFY2(result.find("[CONTENT_TRUNCATED]") != std::string::npos, "Result should contain truncation marker"); + + qInfo() << "INFO: [DocParserAutoTest::testTruncationBasicFunctionality] Original length:" << content.length() + << "Truncated length:" << result.length(); +} + +void DocParserAutoTest::testTruncationWithSmallFiles() +{ + qInfo() << "INFO: [DocParserAutoTest::testTruncationWithSmallFiles] Testing truncation with small files"; + + QString content = "Small file content"; + QString testFile = createTestFile(content, "txt"); + QVERIFY(!testFile.isEmpty()); + + // Set truncation limit larger than file content + size_t maxBytes = 1000; + std::string resultTruncated = DocParser::convertFile(testFile.toStdString(), maxBytes); + std::string resultOriginal = DocParser::convertFile(testFile.toStdString()); + + // Results should be identical for small files + QVERIFY2(resultTruncated == resultOriginal, "Small files should produce identical results with both methods"); + 
QVERIFY2(resultTruncated.find("[CONTENT_TRUNCATED]") == std::string::npos, "Small files should not be marked as truncated"); + + qInfo() << "INFO: [DocParserAutoTest::testTruncationWithSmallFiles] Both results identical, length:" << resultTruncated.length(); +} + +void DocParserAutoTest::testTruncationWithLargeFiles() +{ + qInfo() << "INFO: [DocParserAutoTest::testTruncationWithLargeFiles] Testing truncation with large files"; + + // Create a large text file (100KB) + const int largeSize = 100 * 1024; // 100KB + QString largeContent; + largeContent.reserve(largeSize); + + for (int i = 0; i < largeSize / 50; ++i) { + largeContent += QString("This is line %1 with some content to make the file large.\n").arg(i); + } + + QString largeFile = createTestFile(largeContent, "txt"); + QVERIFY(!largeFile.isEmpty()); + + // Test truncation at 10KB + size_t maxBytes = 10 * 1024; + QElapsedTimer timer; + timer.start(); + + std::string result = DocParser::convertFile(largeFile.toStdString(), maxBytes); + + qint64 elapsed = timer.elapsed(); + qInfo() << "INFO: [DocParserAutoTest::testTruncationWithLargeFiles] Truncated large file parsing took" << elapsed << "ms"; + + QVERIFY2(!result.empty(), "Large file truncation should produce result"); + QVERIFY2(result.length() <= maxBytes + 100, "Truncated result should be within reasonable bounds"); // Allow margin for boundary and marker + QVERIFY2(result.find("[CONTENT_TRUNCATED]") != std::string::npos, "Large file should be marked as truncated"); + QVERIFY2(elapsed < 5000, "Truncated parsing should be faster than full parsing"); // Performance check + + qInfo() << "INFO: [DocParserAutoTest::testTruncationWithLargeFiles] Original size:" << largeContent.length() + << "Truncated size:" << result.length(); +} + +void DocParserAutoTest::testTruncationBoundaryConditions() +{ + qInfo() << "INFO: [DocParserAutoTest::testTruncationBoundaryConditions] Testing truncation boundary conditions"; + + QString content = "This is a test. 
This is another sentence! And this is the third one?"; + QString testFile = createTestFile(content, "txt"); + QVERIFY(!testFile.isEmpty()); + + // Test various boundary conditions + QList testSizes = {0, 1, 10, 20, 30, 100}; + + for (size_t maxBytes : testSizes) { + std::string result = DocParser::convertFile(testFile.toStdString(), maxBytes); + + if (maxBytes == 0) { + // Zero bytes limit should produce empty result or just the truncation marker + bool isValidZeroResult = result.empty() || + result == "\n[CONTENT_TRUNCATED]" || + result == "[CONTENT_TRUNCATED]"; + QVERIFY2(isValidZeroResult, qPrintable(QString("Zero bytes should produce empty or marker-only result, got: '%1'").arg(QString::fromStdString(result)))); + } else if (maxBytes >= content.length()) { + // Should not be truncated + QVERIFY2(result.find("[CONTENT_TRUNCATED]") == std::string::npos, + qPrintable(QString("Content smaller than limit (%1) should not be truncated").arg(maxBytes))); + } else { + // Should be truncated + QVERIFY2(result.find("[CONTENT_TRUNCATED]") != std::string::npos, + qPrintable(QString("Content larger than limit (%1) should be truncated").arg(maxBytes))); + } + + qInfo() << "INFO: [DocParserAutoTest::testTruncationBoundaryConditions] Limit:" << maxBytes + << "Result length:" << result.length(); + } +} + +void DocParserAutoTest::testTruncationBackwardCompatibility() +{ + qInfo() << "INFO: [DocParserAutoTest::testTruncationBackwardCompatibility] Testing backward compatibility"; + + QStringList testContents = { + "Simple text content", + "Multi-line\ncontent\nwith\nseveral\nlines", + "Content with special characters: äöü 中文 🚀", + "" // Empty content + }; + + QStringList extensions = {"txt", "md", "html", "json"}; + + for (const QString& content : testContents) { + for (const QString& ext : extensions) { + QString testFile = createTestFile(content, ext); + if (testFile.isEmpty()) continue; + + // Test that original method still works exactly as before + std::string resultOriginal = 
DocParser::convertFile(testFile.toStdString()); + + // Test that new method with large limit produces identical results + std::string resultWithLimit = DocParser::convertFile(testFile.toStdString(), 1000000); // 1MB limit + + QVERIFY2(resultOriginal == resultWithLimit, + qPrintable(QString("Backward compatibility failed for %1 file with content length %2") + .arg(ext).arg(content.length()))); + } + } + + qInfo() << "INFO: [DocParserAutoTest::testTruncationBackwardCompatibility] All backward compatibility tests passed"; +} + QString DocParserAutoTest::createTestFile(const QString &content, const QString &suffix) { QString fileName = m_tempDir->path() + QString("/test_file_%1.%2").arg(QRandomGenerator::global()->generate()).arg(suffix); diff --git a/tests/main.cpp b/tests/main.cpp index 4d05fec..b91e7e7 100644 --- a/tests/main.cpp +++ b/tests/main.cpp @@ -34,6 +34,7 @@ int main(int argc, char **argv) // Get positional arguments const QStringList args = parser.positionalArguments(); + int truncCount = 0; // Check if a file path was provided if (args.isEmpty()) { @@ -45,6 +46,11 @@ int main(int argc, char **argv) // Get the file path QString filePath = args.first(); + // Get the optional truncation limit in bytes (second positional argument) + if (args.count() == 2) { + truncCount = args[1].toInt(); + } + // Check if the file exists QFileInfo fileInfo(filePath); if (!fileInfo.exists()) { @@ -63,7 +69,8 @@ int main(int argc, char **argv) try { // Call the docparser library to parse the file - std::string content = DocParser::convertFile(filePath.toStdString()); + std::string content = truncCount > 0 ? DocParser::convertFile(filePath.toStdString(), truncCount) + : DocParser::convertFile(filePath.toStdString()); // Convert encoding if (fromEncoding.toLower() != "utf-8") {