From 9e462dcc1bc513d25692e42b07b6f64ea9cd9b80 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Fri, 28 Nov 2025 16:13:08 +0800 Subject: [PATCH] fix: improve file conversion fallback logic for truncated files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactored the file conversion logic to: 1. Split the truncation handling into two separate functions 2. First attempt conversion with original file extension 3. Only fall back to similar extensions if primary conversion fails 4. Remove duplicate error logging and simplify control flow 5. Improve code organization and separation of concerns The changes ensure proper fallback behavior when processing files with different extensions and maintain cleaner error handling. Previously, similar extensions were tried only when no parser could be created for the original extension, so a conversion that failed or returned empty content never triggered the fallback. Influence: 1. Test with files having correct extensions that should convert successfully 2. Test with files needing fallback to similar extensions 3. Verify truncation behavior works correctly for both cases 4. Check error cases where neither primary nor fallback extensions work 5. Validate truncation markers are added correctly fix: 改进截断文件转换的回退逻辑 重构文件转换逻辑: 1. 将截断处理拆分为两个独立函数 2. 首先尝试使用原始文件扩展名进行转换 3. 仅当主转换失败时才回退到类似扩展名 4. 移除重复的错误日志记录并简化控制流 5. 改进代码组织和关注点分离 这些改动确保在处理不同扩展名文件时有正确的回退行为,并保持更清晰的错误处理。之前的代码仅在无法为原始扩展名创建解析器时才会尝试类似扩展名,因此当转换失败或结果为空时不会触发回退。 Influence: 1. 测试使用应能成功转换的正确扩展名文件 2. 测试需要回退到类似扩展名的文件 3. 验证两种情况的截断行为是否正确 4. 检查主转换和回退扩展名都无效的错误情况 5. 
验证截断标记是否正确添加 --- debian/changelog | 6 +++++ src/docparser.cpp | 57 ++++++++++++++++++++++++++++++----------------- 2 files changed, 42 insertions(+), 21 deletions(-) diff --git a/debian/changelog b/debian/changelog index d8942c8..76754e5 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +docparser (1.0.24) unstable; urgency=medium + + * improve file conversion fallback logic for truncated files + + -- Zhang Sheng Fri, 28 Nov 2025 16:14:02 +0800 + docparser (1.0.23) unstable; urgency=medium * add defensive checks in excel formula evaluation diff --git a/src/docparser.cpp b/src/docparser.cpp index 8a68063..edeec88 100644 --- a/src/docparser.cpp +++ b/src/docparser.cpp @@ -301,30 +301,16 @@ static std::string doConvertFile(const std::string &filename, std::string suffix } /** - * @brief Convert file with truncation support + * @brief Helper function to convert file with truncation for a given suffix * @param filename Path to the file + * @param suffix File extension to use * @param maxBytes Maximum bytes to process - * @return Converted text content (potentially truncated) + * @return Converted text content (potentially truncated), or empty string if failed */ -static std::string doConvertFileWithTruncation(const std::string &filename, size_t maxBytes) +static std::string tryConvertWithTruncation(const std::string &filename, const std::string &suffix, size_t maxBytes) { - std::string suffix = extractFileExtension(filename); - if (suffix.empty()) { - return {}; - } - std::unique_ptr document = createParser(filename, suffix); if (!document) { - // Try similar extensions - static const std::unordered_map similarExtensionMap = createSimilarExtensionMap(); - auto it = similarExtensionMap.find(suffix); - if (it != similarExtensionMap.end()) { - document = createParser(filename, it->second); - } - } - - if (!document) { - std::cerr << "ERROR: [doConvertFileWithTruncation] Unsupported file extension: " << filename << std::endl; return {}; } @@ -337,17 
+323,17 @@ static std::string doConvertFileWithTruncation(const std::string &filename, size // Get result and add truncation marker if needed std::string result = std::move(document->m_text); - + // Fallback truncation: if the result still exceeds maxBytes, do final truncation if (result.size() > maxBytes) { result = document->applyFinalTruncation(result, maxBytes); document->markAsTruncated(); } - + if (document->isTruncated()) { result += "\n[CONTENT_TRUNCATED]"; } - + return result; } catch (const std::logic_error &error) { std::cout << error.what() << std::endl; @@ -358,6 +344,35 @@ static std::string doConvertFileWithTruncation(const std::string &filename, size return {}; } +/** + * @brief Convert file with truncation support + * @param filename Path to the file + * @param maxBytes Maximum bytes to process + * @return Converted text content (potentially truncated) + */ +static std::string doConvertFileWithTruncation(const std::string &filename, size_t maxBytes) +{ + std::string suffix = extractFileExtension(filename); + if (suffix.empty()) { + return {}; + } + + // Try converting with the original suffix first + std::string content = tryConvertWithTruncation(filename, suffix, maxBytes); + if (!content.empty()) { + return content; + } + + // If content is empty, try similar extensions + static const std::unordered_map similarExtensionMap = createSimilarExtensionMap(); + auto it = similarExtensionMap.find(suffix); + if (it != similarExtensionMap.end()) { + return tryConvertWithTruncation(filename, it->second, maxBytes); + } + + return {}; +} + std::string DocParser::convertFile(const std::string &filename) { std::string suffix = extractFileExtension(filename);