From ab5f0e3744d2d343af51b7d5d3f45c01bc21bb62 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Mon, 24 Mar 2025 15:10:27 +0800 Subject: [PATCH 1/3] chore: add tests project add tests project Log: --- CMakeLists.txt | 8 ++++ tests/CMakeLists.txt | 29 ++++++++++++++ tests/main.cpp | 92 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+) create mode 100644 tests/CMakeLists.txt create mode 100644 tests/main.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 3002d9e..13be557 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,3 +26,11 @@ pkg_check_modules(DEPS REQUIRED # 添加子目录 add_subdirectory(src) + +# 添加测试选项,默认不构建 +option(BUILD_TESTS "Build test applications" OFF) + +# 有条件地添加测试目录 +if(BUILD_TESTS) + add_subdirectory(tests) +endif() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000..5546901 --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,29 @@ +find_package(Qt6 REQUIRED COMPONENTS Core) +find_package(Dtk6 REQUIRED COMPONENTS Core) + +# 设置测试程序的源文件 +set(TEST_SOURCES + main.cpp +) + +# 创建测试可执行文件 +add_executable(docparser_test ${TEST_SOURCES}) + +# 链接docparser库和Qt6Core +target_link_libraries(docparser_test + PRIVATE + docparser + Qt6::Core + Dtk6::Core +) + +# 设置包含目录 +target_include_directories(docparser_test + PRIVATE + ${CMAKE_SOURCE_DIR}/src +) + +# 安装测试程序(可选) +install(TARGETS docparser_test + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) diff --git a/tests/main.cpp b/tests/main.cpp new file mode 100644 index 0000000..631c1bb --- /dev/null +++ b/tests/main.cpp @@ -0,0 +1,92 @@ +// SPDX-FileCopyrightText: 2024 UnionTech Software Technology Co., Ltd. +// +// SPDX-License-Identifier: LGPL-3.0-or-later + +#include "docparser.h" +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char **argv) +{ + QCoreApplication app(argc, argv); + QCoreApplication::setApplicationName("docparser_test"); + QCoreApplication::setApplicationVersion("1.0"); + + // Setup command line parser + QCommandLineParser parser; + parser.setApplicationDescription("Document parser test application"); + parser.addHelpOption(); + parser.addVersionOption(); + + // Add file path argument + parser.addPositionalArgument("file", "The file to parse"); + + // Process the command line arguments + parser.process(app); + + // Get positional arguments + const QStringList args = parser.positionalArguments(); + + // Check if a file path was provided + if (args.isEmpty()) { + qCritical() << "Error: A file path must be provided as an argument"; + parser.showHelp(1); + return 1; + } + + // Get the file path + QString filePath = args.first(); + + // Check if the file exists + QFileInfo fileInfo(filePath); + if (!fileInfo.exists()) { + qCritical() << "Error: File" << filePath << "does not exist"; + return 1; + } + + auto fromEncoding = Dtk::Core::DTextEncoding::detectFileEncoding(filePath); + // Print file information + qInfo() << "Parsing file:" << filePath << "From encoding: " << fromEncoding; + + try { + // Call the docparser library to parse the file + std::string content = DocParser::convertFile(filePath.toStdString()); + + // Convert encoding + if (fromEncoding.toLower() != "utf-8") { + auto in = QByteArray::fromStdString(content); + QByteArray out; + Dtk::Core::DTextEncoding::convertTextEncodingEx(in, out, "utf-8", fromEncoding); + content = out.toStdString(); + } + + // Check the parsing result + if (content.empty()) { + qWarning() << "Warning: No content was extracted from the file"; + return 0; + } + + // Print the parsed content + QTextStream out(stdout); + out << "\n=========== File Content ===========\n\n"; + out << QString::fromStdString(content) << "\n"; + out << "\n=================================\n"; + + // Print statistics + qInfo() << "Parsing successful, extracted" << content.length() << "characters"; + + } catch (const std::exception &e) { + qCritical() << "An error occurred during parsing:" << e.what(); + return 1; + } catch (...) { + qCritical() << "An unknown error occurred during parsing"; + return 1; + } + + return 0; +} From 81da63ee31c49d83aab4bf521c310799ed529409 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Mon, 24 Mar 2025 17:01:11 +0800 Subject: [PATCH 2/3] refactor: enhance docparser with suffix handling and file creation - Introduced a mapping of file extensions to their respective creation functions to streamline document handling. - Replaced the previous suffix validation logic with a static set of valid text suffixes. - Improved suffix handling by transforming to lowercase and added support for similar suffixes. - Refactored the file conversion logic to utilize the new mapping for better maintainability and readability. Log: This update enhances the flexibility and efficiency of the document parsing process. --- src/docparser.cpp | 197 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 136 insertions(+), 61 deletions(-) diff --git a/src/docparser.cpp b/src/docparser.cpp index 7c9c8af..91274cd 100644 --- a/src/docparser.cpp +++ b/src/docparser.cpp @@ -20,97 +20,172 @@ #include #include #include +#include +#include +#include -static bool isValidSuffix(const std::unordered_set &supportedSuffixs, const std::string &suffix) +static bool isTextSuffix(std::string_view suffix) { + static const std::unordered_set validSuffixes = { + "txt", "text", "md", "markdown", "sh", "html", "htm", + "xml", "xhtml", "dhtml", "shtm", "shtml", "json", + "css", "yaml", "ini", "bat", "js", "sql", "uof" + }; + std::string lowercaseSuffix(suffix); - std::transform(lowercaseSuffix.begin(), lowercaseSuffix.end(), lowercaseSuffix.begin(), ::tolower); + std::transform(lowercaseSuffix.begin(), lowercaseSuffix.end(), lowercaseSuffix.begin(), + [](unsigned char c) { return std::tolower(c); }); - return supportedSuffixs.count(lowercaseSuffix) > 0; + return validSuffixes.count(lowercaseSuffix) > 0; } -static bool isTextSuffix(const std::string &suffix) +// 预处理后缀映射,避免多次strcasecmp比较 +using FileCreator = std::unique_ptr (*)(const std::string &, const std::string &); + +static std::unique_ptr createDocx(const std::string &filename, const std::string &) { - static const std::unordered_set validSuffixes = { - "txt", "text", "md", "markdown", "sh", "html", "htm", - "xml", "xhtml", "dhtml", "shtm", "shtml", "json", - "css", "yaml", "ini", "bat", "js", "sql", "uof" - }; - return isValidSuffix(validSuffixes, suffix); + return std::make_unique(filename); +} + +static std::unique_ptr createPptx(const std::string &filename, const std::string &) +{ + return std::make_unique(filename); +} + +static std::unique_ptr createTxt(const std::string &filename, const std::string &) +{ + return std::make_unique(filename); +} + +static std::unique_ptr createDoc(const std::string &filename, const std::string &) +{ + return std::make_unique(filename); +} + +static std::unique_ptr createRtf(const std::string &filename, const std::string &) +{ + return std::make_unique(filename); +} + +static std::unique_ptr createOdf(const std::string &filename, const std::string &) +{ + return std::make_unique(filename); +} + +static std::unique_ptr createExcel(const std::string &filename, const std::string &suffix) +{ + return std::make_unique(filename, suffix); +} + +static std::unique_ptr createXlsb(const std::string &filename, const std::string &) +{ + return std::make_unique(filename); +} + +static std::unique_ptr createPpt(const std::string &filename, const std::string &) +{ + return std::make_unique(filename); +} + +static std::unique_ptr createPdf(const std::string &filename, const std::string &) +{ + return std::make_unique(filename); +} + +static std::unique_ptr createOfd(const std::string &filename, const std::string &) +{ + return std::make_unique(filename); } -static std::string doConvertFile(const std::string &filename, const std::string &suffix) + +// 缓存后缀到创建函数的映射 +static const std::unordered_map extensionMap = { + { "docx", createDocx }, + { "pptx", createPptx }, + { "ppsx", createPptx }, + { "doc", createDoc }, + { "dot", createDoc }, + { "wps", createDoc }, + { "rtf", createRtf }, + { "odg", createOdf }, + { "odt", createOdf }, + { "ods", createOdf }, + { "odp", createOdf }, + { "xls", createExcel }, + { "xlsx", createExcel }, + { "xlsb", createXlsb }, + { "ppt", createPpt }, + { "pps", createPpt }, + { "dps", createPpt }, + { "pot", createPpt }, + { "pdf", createPdf }, + { "ofd", createOfd } +}; + +// 缓存相似后缀的映射关系 +static const std::unordered_map similarExtensionMap = { + { "doc", "docx" }, + { "docx", "doc" }, + { "xls", "xlsx" }, + { "xlsx", "xls" }, + { "ppt", "pptx" }, + { "pptx", "ppt" } +}; + +static std::string doConvertFile(const std::string &filename, std::string suffix) { - std::string content; + // 转换后缀为小写 + std::transform(suffix.begin(), suffix.end(), suffix.begin(), + [](unsigned char c) { return std::tolower(c); }); + std::unique_ptr document; + try { - // 比较后缀名,不区分大小写 - if (!strcasecmp(suffix.c_str(), "docx")) { - document.reset(new docx::Docx(filename)); - } else if (!strcasecmp(suffix.c_str(), "pptx") || !strcasecmp(suffix.c_str(), "ppsx")) { - document.reset(new pptx::Pptx(filename)); - } else if (isTextSuffix(suffix)) { - document.reset(new txt::Txt(filename)); - } else if (!strcasecmp(suffix.c_str(), "doc") || !strcasecmp(suffix.c_str(), "dot") || !strcasecmp(suffix.c_str(), "wps")) { - document.reset(new doc::Doc(filename)); - } else if (!strcasecmp(suffix.c_str(), "rtf")) { - document.reset(new rtf::Rtf(filename)); - } else if (!strcasecmp(suffix.c_str(), "odg") || !strcasecmp(suffix.c_str(), "odt") || !strcasecmp(suffix.c_str(), "ods") || !strcasecmp(suffix.c_str(), "odp")) { - document.reset(new odf::Odf(filename)); - } else if (!strcasecmp(suffix.c_str(), "xls") || !strcasecmp(suffix.c_str(), "xlsx")) { - document.reset(new excel::Excel(filename, suffix)); - } else if (!strcasecmp(suffix.c_str(), "xlsb")) { - document.reset(new xlsb::Xlsb(filename)); - } else if (!strcasecmp(suffix.c_str(), "ppt") || !strcasecmp(suffix.c_str(), "pps") || !strcasecmp(suffix.c_str(), "dps") || !strcasecmp(suffix.c_str(), "pot")) { - document.reset(new ppt::Ppt(filename)); - } else if (!strcasecmp(suffix.c_str(), "pdf")) { - document.reset(new pdf::Pdf(filename)); - } else if (!strcasecmp(suffix.c_str(), "ofd")) { - document.reset(new ofd::Ofd(filename)); + // 先检查是否是文本文件 + if (isTextSuffix(suffix)) { + document = createTxt(filename, suffix); } else { - throw std::logic_error("Unsupported file extension: " + filename); + // 查找对应的创建函数 + auto it = extensionMap.find(suffix); + if (it != extensionMap.end()) { + document = it->second(filename, suffix); + } else { + throw std::logic_error("Unsupported file extension: " + filename); + } } document->convert(); - content = document->m_text; + // 使用移动语义避免复制 + return std::move(document->m_text); } catch (const std::logic_error &error) { std::cout << error.what() << std::endl; } catch (...) { std::cerr << "Parse failed: " << filename << std::endl; } - document.reset(); - return content; + return {}; } std::string DocParser::convertFile(const std::string &filename) { - std::string suffix = filename.substr(filename.find_last_of('.') + 1); - if (suffix.empty()) + // 更高效地查找最后一个点的位置 + size_t dotPos = filename.find_last_of('.'); + if (dotPos == std::string::npos || dotPos == filename.length() - 1) return {}; - // 已解析出内容,直接返回结果 - const auto &content { doConvertFile(filename, suffix) }; + std::string suffix = filename.substr(dotPos + 1); + std::transform(suffix.begin(), suffix.end(), suffix.begin(), + [](unsigned char c) { return std::tolower(c); }); + + // 尝试使用原始后缀解析 + std::string content = doConvertFile(filename, suffix); if (!content.empty()) return content; - // 对于解析失败的情况,再进行相似后缀的尝试 - // doc <-> docx - if (!strcasecmp(suffix.c_str(), "doc")) - return doConvertFile(filename, "docx"); - if (!strcasecmp(suffix.c_str(), "docx")) - return doConvertFile(filename, "doc"); - - // xls <-> xlsx - if (!strcasecmp(suffix.c_str(), "xls")) - return doConvertFile(filename, "xlsx"); - if (!strcasecmp(suffix.c_str(), "xlsx")) - return doConvertFile(filename, "xls"); - - // ppt <-> pptx - if (!strcasecmp(suffix.c_str(), "ppt")) - return doConvertFile(filename, "pptx"); - if (!strcasecmp(suffix.c_str(), "pptx")) - return doConvertFile(filename, "ppt"); + // 尝试相似后缀 + auto it = similarExtensionMap.find(suffix); + if (it != similarExtensionMap.end()) { + return doConvertFile(filename, it->second); + } return {}; } From 00c09d86bb419b238b130f4af1b420c574564aa1 Mon Sep 17 00:00:00 2001 From: Zhang Sheng Date: Mon, 24 Mar 2025 17:29:11 +0800 Subject: [PATCH 3/3] chore: bump version to 1.0.17 1.0.17 Log: --- debian/changelog | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/debian/changelog b/debian/changelog index f74d588..ce0643a 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +docparser (1.0.17) unstable; urgency=medium + + * add tests + * refactor + + -- Zhang Sheng Mon, 24 Mar 2025 17:28:27 +0800 + docparser (1.0.16) unstable; urgency=medium * remove symbol visibility settings