Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,11 @@ pkg_check_modules(DEPS REQUIRED

# Add the src subdirectory
add_subdirectory(src)

# Test option; tests are not built by default
option(BUILD_TESTS "Build test applications" OFF)

# Add the tests directory only when BUILD_TESTS is enabled
if(BUILD_TESTS)
add_subdirectory(tests)
endif()
7 changes: 7 additions & 0 deletions debian/changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
docparser (1.0.17) unstable; urgency=medium

* add tests
* refactor

-- Zhang Sheng <zhangsheng@uniontech.com> Mon, 24 Mar 2025 17:28:27 +0800

docparser (1.0.16) unstable; urgency=medium

* remove symbol visibility settings
Expand Down
197 changes: 136 additions & 61 deletions src/docparser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,100 +17,175 @@
#include "fileext/xlsb/xlsb.h"

#include <memory>
#include <iostream>

Check warning on line 20 in src/docparser.cpp

View workflow job for this annotation

GitHub Actions / cppcheck

Include file: <iostream> not found. Please note: Cppcheck does not need standard library headers to get proper results.
#include <cstring>

Check warning on line 21 in src/docparser.cpp

View workflow job for this annotation

GitHub Actions / cppcheck

Include file: <cstring> not found. Please note: Cppcheck does not need standard library headers to get proper results.
#include <unordered_set>

Check warning on line 22 in src/docparser.cpp

View workflow job for this annotation

GitHub Actions / cppcheck

Include file: <unordered_set> not found. Please note: Cppcheck does not need standard library headers to get proper results.
#include <unordered_map>

Check warning on line 23 in src/docparser.cpp

View workflow job for this annotation

GitHub Actions / cppcheck

Include file: <unordered_map> not found. Please note: Cppcheck does not need standard library headers to get proper results.
#include <string_view>

Check warning on line 24 in src/docparser.cpp

View workflow job for this annotation

GitHub Actions / cppcheck

Include file: <string_view> not found. Please note: Cppcheck does not need standard library headers to get proper results.
#include <algorithm>

Check warning on line 25 in src/docparser.cpp

View workflow job for this annotation

GitHub Actions / cppcheck

Include file: <algorithm> not found. Please note: Cppcheck does not need standard library headers to get proper results.

// NOTE(review): this span appears to be a rendered diff with the +/- markers
// stripped: it carries two signatures, two std::transform calls and two return
// statements, i.e. the removed isValidSuffix helper interleaved with the added
// isTextSuffix. Confirm against the real file before editing.
// The isTextSuffix lines copy `suffix` into a std::string, lowercase it, and
// report whether the result is a member of validSuffixes.
static bool isValidSuffix(const std::unordered_set<std::string> &supportedSuffixs, const std::string &suffix)
static bool isTextSuffix(std::string_view suffix)
{
static const std::unordered_set<std::string_view> validSuffixes = {
"txt", "text", "md", "markdown", "sh", "html", "htm",
"xml", "xhtml", "dhtml", "shtm", "shtml", "json",
"css", "yaml", "ini", "bat", "js", "sql", "uof"
};

std::string lowercaseSuffix(suffix);
std::transform(lowercaseSuffix.begin(), lowercaseSuffix.end(), lowercaseSuffix.begin(), ::tolower);
// The argument is taken as unsigned char before being passed to std::tolower
std::transform(lowercaseSuffix.begin(), lowercaseSuffix.end(), lowercaseSuffix.begin(),
[](unsigned char c) { return std::tolower(c); });

return supportedSuffixs.count(lowercaseSuffix) > 0;
return validSuffixes.count(lowercaseSuffix) > 0;
}

// NOTE(review): diff residue — the line below and the validSuffixes set inside
// the braces appear to be the removed side (old isTextSuffix); the FileCreator
// alias and createDocx appear to be the added side. Confirm against the real file.
static bool isTextSuffix(const std::string &suffix)
// Precomputed suffix mapping, to avoid repeated strcasecmp comparisons
// FileCreator: pointer to a factory taking (filename, suffix) and returning an owning FileExtension pointer
using FileCreator = std::unique_ptr<fileext::FileExtension> (*)(const std::string &, const std::string &);

// Builds a docx::Docx parser for `filename`; the second (unnamed) argument is unused.
static std::unique_ptr<fileext::FileExtension> createDocx(const std::string &filename, const std::string &)
{
static const std::unordered_set<std::string> validSuffixes = {
"txt", "text", "md", "markdown", "sh", "html", "htm",
"xml", "xhtml", "dhtml", "shtm", "shtml", "json",
"css", "yaml", "ini", "bat", "js", "sql", "uof"
};
return isValidSuffix(validSuffixes, suffix);
return std::make_unique<docx::Docx>(filename);
}

// Builds a pptx::Pptx parser for `filename`; the suffix argument is unused.
static std::unique_ptr<fileext::FileExtension> createPptx(const std::string &filename, const std::string & /*suffix*/)
{
    std::unique_ptr<fileext::FileExtension> parser = std::make_unique<pptx::Pptx>(filename);
    return parser;
}

// Builds a txt::Txt parser for `filename`; the suffix argument is unused.
static std::unique_ptr<fileext::FileExtension> createTxt(const std::string &filename, const std::string & /*suffix*/)
{
    std::unique_ptr<fileext::FileExtension> parser = std::make_unique<txt::Txt>(filename);
    return parser;
}

// Builds a doc::Doc parser for `filename`; the suffix argument is unused.
static std::unique_ptr<fileext::FileExtension> createDoc(const std::string &filename, const std::string & /*suffix*/)
{
    std::unique_ptr<fileext::FileExtension> parser = std::make_unique<doc::Doc>(filename);
    return parser;
}

// Builds an rtf::Rtf parser for `filename`; the suffix argument is unused.
static std::unique_ptr<fileext::FileExtension> createRtf(const std::string &filename, const std::string & /*suffix*/)
{
    std::unique_ptr<fileext::FileExtension> parser = std::make_unique<rtf::Rtf>(filename);
    return parser;
}

// Builds an odf::Odf parser for `filename`; the suffix argument is unused.
static std::unique_ptr<fileext::FileExtension> createOdf(const std::string &filename, const std::string & /*suffix*/)
{
    std::unique_ptr<fileext::FileExtension> parser = std::make_unique<odf::Odf>(filename);
    return parser;
}

// Builds an excel::Excel parser for `filename`. Unlike the other factories,
// this one forwards `suffix` to the parser's constructor.
static std::unique_ptr<fileext::FileExtension> createExcel(const std::string &filename, const std::string &suffix)
{
    std::unique_ptr<fileext::FileExtension> parser = std::make_unique<excel::Excel>(filename, suffix);
    return parser;
}

// Builds an xlsb::Xlsb parser for `filename`; the suffix argument is unused.
static std::unique_ptr<fileext::FileExtension> createXlsb(const std::string &filename, const std::string & /*suffix*/)
{
    std::unique_ptr<fileext::FileExtension> parser = std::make_unique<xlsb::Xlsb>(filename);
    return parser;
}

// Builds a ppt::Ppt parser for `filename`; the suffix argument is unused.
static std::unique_ptr<fileext::FileExtension> createPpt(const std::string &filename, const std::string & /*suffix*/)
{
    std::unique_ptr<fileext::FileExtension> parser = std::make_unique<ppt::Ppt>(filename);
    return parser;
}

// Builds a pdf::Pdf parser for `filename`; the suffix argument is unused.
static std::unique_ptr<fileext::FileExtension> createPdf(const std::string &filename, const std::string & /*suffix*/)
{
    std::unique_ptr<fileext::FileExtension> parser = std::make_unique<pdf::Pdf>(filename);
    return parser;
}

// Builds an ofd::Ofd parser for `filename`; the suffix argument is unused.
static std::unique_ptr<fileext::FileExtension> createOfd(const std::string &filename, const std::string & /*suffix*/)
{
    std::unique_ptr<fileext::FileExtension> parser = std::make_unique<ofd::Ofd>(filename);
    return parser;
}
static std::string doConvertFile(const std::string &filename, const std::string &suffix)

// Cached mapping from file suffix to the factory function that creates its parser.
// All keys are lowercase literals; several suffixes share one factory.
static const std::unordered_map<std::string, FileCreator> extensionMap = {
{ "docx", createDocx },
{ "pptx", createPptx },
{ "ppsx", createPptx },
{ "doc", createDoc },
{ "dot", createDoc },
{ "wps", createDoc },
{ "rtf", createRtf },
{ "odg", createOdf },
{ "odt", createOdf },
{ "ods", createOdf },
{ "odp", createOdf },
{ "xls", createExcel },
{ "xlsx", createExcel },
{ "xlsb", createXlsb },
{ "ppt", createPpt },
{ "pps", createPpt },
{ "dps", createPpt },
{ "pot", createPpt },
{ "pdf", createPdf },
{ "ofd", createOfd }
};

// Cached mapping between similar suffixes: each entry maps a suffix to its
// counterpart (doc <-> docx, xls <-> xlsx, ppt <-> pptx).
static const std::unordered_map<std::string, std::string> similarExtensionMap = {
{ "doc", "docx" },
{ "docx", "doc" },
{ "xls", "xlsx" },
{ "xlsx", "xls" },
{ "ppt", "pptx" },
{ "pptx", "ppt" }
};

// NOTE(review): this span appears to be a rendered diff with the +/- markers
// stripped: the strcasecmp if/else chain, `std::string content`, and the final
// `document.reset(); return content;` look like the removed side, while the
// isTextSuffix/extensionMap lookup and `return {};` look like the added side.
// Confirm against the real file before editing.
// The map-lookup lines pick a parser for `suffix`, call convert() on it and
// return its m_text. A std::logic_error is printed to std::cout, any other
// exception is reported on std::cerr, and the function falls through to the
// returns below.
static std::string doConvertFile(const std::string &filename, std::string suffix)
{
std::string content;
// Convert the suffix to lowercase
std::transform(suffix.begin(), suffix.end(), suffix.begin(),
[](unsigned char c) { return std::tolower(c); });

std::unique_ptr<fileext::FileExtension> document;

try {
// Compare the suffix case-insensitively
if (!strcasecmp(suffix.c_str(), "docx")) {
document.reset(new docx::Docx(filename));
} else if (!strcasecmp(suffix.c_str(), "pptx") || !strcasecmp(suffix.c_str(), "ppsx")) {
document.reset(new pptx::Pptx(filename));
} else if (isTextSuffix(suffix)) {
document.reset(new txt::Txt(filename));
} else if (!strcasecmp(suffix.c_str(), "doc") || !strcasecmp(suffix.c_str(), "dot") || !strcasecmp(suffix.c_str(), "wps")) {
document.reset(new doc::Doc(filename));
} else if (!strcasecmp(suffix.c_str(), "rtf")) {
document.reset(new rtf::Rtf(filename));
} else if (!strcasecmp(suffix.c_str(), "odg") || !strcasecmp(suffix.c_str(), "odt") || !strcasecmp(suffix.c_str(), "ods") || !strcasecmp(suffix.c_str(), "odp")) {
document.reset(new odf::Odf(filename));
} else if (!strcasecmp(suffix.c_str(), "xls") || !strcasecmp(suffix.c_str(), "xlsx")) {
document.reset(new excel::Excel(filename, suffix));
} else if (!strcasecmp(suffix.c_str(), "xlsb")) {
document.reset(new xlsb::Xlsb(filename));
} else if (!strcasecmp(suffix.c_str(), "ppt") || !strcasecmp(suffix.c_str(), "pps") || !strcasecmp(suffix.c_str(), "dps") || !strcasecmp(suffix.c_str(), "pot")) {
document.reset(new ppt::Ppt(filename));
} else if (!strcasecmp(suffix.c_str(), "pdf")) {
document.reset(new pdf::Pdf(filename));
} else if (!strcasecmp(suffix.c_str(), "ofd")) {
document.reset(new ofd::Ofd(filename));
// First check whether this is a text file
if (isTextSuffix(suffix)) {
document = createTxt(filename, suffix);
} else {
throw std::logic_error("Unsupported file extension: " + filename);
// Look up the matching creator function
auto it = extensionMap.find(suffix);
if (it != extensionMap.end()) {
document = it->second(filename, suffix);
} else {
throw std::logic_error("Unsupported file extension: " + filename);
}
}

document->convert();
content = document->m_text;
// Use move semantics to avoid a copy
return std::move(document->m_text);
} catch (const std::logic_error &error) {
std::cout << error.what() << std::endl;
} catch (...) {
std::cerr << "Parse failed: " << filename << std::endl;
}

document.reset();
return content;
return {};
}

// NOTE(review): this span appears to be a rendered diff with the +/- markers
// stripped: `suffix` and `content` are each declared twice, and the strcasecmp
// fallback chain sits beside the similarExtensionMap lookup that seems to
// replace it. Confirm against the real file before editing.
// The dotPos-based lines return an empty string when `filename` has no '.' or
// ends with one. Otherwise they lowercase the text after the last '.', try
// doConvertFile with it, and on an empty result retry once with the
// counterpart suffix from similarExtensionMap.
std::string DocParser::convertFile(const std::string &filename)
{
std::string suffix = filename.substr(filename.find_last_of('.') + 1);
if (suffix.empty())
// Find the position of the last dot more efficiently
size_t dotPos = filename.find_last_of('.');
if (dotPos == std::string::npos || dotPos == filename.length() - 1)
return {};

// Content was parsed successfully; return the result directly
const auto &content { doConvertFile(filename, suffix) };
std::string suffix = filename.substr(dotPos + 1);
std::transform(suffix.begin(), suffix.end(), suffix.begin(),
[](unsigned char c) { return std::tolower(c); });

// Try parsing with the original suffix
std::string content = doConvertFile(filename, suffix);
if (!content.empty())
return content;

// If parsing failed, retry with the similar suffix
// doc <-> docx
if (!strcasecmp(suffix.c_str(), "doc"))
return doConvertFile(filename, "docx");
if (!strcasecmp(suffix.c_str(), "docx"))
return doConvertFile(filename, "doc");

// xls <-> xlsx
if (!strcasecmp(suffix.c_str(), "xls"))
return doConvertFile(filename, "xlsx");
if (!strcasecmp(suffix.c_str(), "xlsx"))
return doConvertFile(filename, "xls");

// ppt <-> pptx
if (!strcasecmp(suffix.c_str(), "ppt"))
return doConvertFile(filename, "pptx");
if (!strcasecmp(suffix.c_str(), "pptx"))
return doConvertFile(filename, "ppt");
// Try the similar suffix
auto it = similarExtensionMap.find(suffix);
if (it != similarExtensionMap.end()) {
return doConvertFile(filename, it->second);
}

return {};
}
29 changes: 29 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# The test program needs the Core components of Qt6 and Dtk6
find_package(Qt6 REQUIRED COMPONENTS Core)
find_package(Dtk6 REQUIRED COMPONENTS Core)

# Source files of the test program
set(TEST_SOURCES
main.cpp
)

# Create the test executable
add_executable(docparser_test ${TEST_SOURCES})

# Link against the docparser library, Qt6 Core and Dtk6 Core
target_link_libraries(docparser_test
PRIVATE
docparser
Qt6::Core
Dtk6::Core
)

# Set include directories
target_include_directories(docparser_test
PRIVATE
${CMAKE_SOURCE_DIR}/src
)

# Install the test program (optional)
install(TARGETS docparser_test
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
)
92 changes: 92 additions & 0 deletions tests/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// SPDX-FileCopyrightText: 2024 UnionTech Software Technology Co., Ltd.
//
// SPDX-License-Identifier: LGPL-3.0-or-later

#include "docparser.h"

Check warning on line 5 in tests/main.cpp

View workflow job for this annotation

GitHub Actions / cppcheck

Include file: "docparser.h" not found.
#include <QCoreApplication>

Check warning on line 6 in tests/main.cpp

View workflow job for this annotation

GitHub Actions / cppcheck

Include file: <QCoreApplication> not found. Please note: Cppcheck does not need standard library headers to get proper results.
#include <QCommandLineParser>

Check warning on line 7 in tests/main.cpp

View workflow job for this annotation

GitHub Actions / cppcheck

Include file: <QCommandLineParser> not found. Please note: Cppcheck does not need standard library headers to get proper results.
#include <QCommandLineOption>

Check warning on line 8 in tests/main.cpp

View workflow job for this annotation

GitHub Actions / cppcheck

Include file: <QCommandLineOption> not found. Please note: Cppcheck does not need standard library headers to get proper results.
#include <QFileInfo>
#include <QDebug>
#include <QTextStream>
#include <DTextEncoding>

int main(int argc, char **argv)
{
QCoreApplication app(argc, argv);
QCoreApplication::setApplicationName("docparser_test");
QCoreApplication::setApplicationVersion("1.0");

// Setup command line parser
QCommandLineParser parser;
parser.setApplicationDescription("Document parser test application");
parser.addHelpOption();
parser.addVersionOption();

// Add file path argument
parser.addPositionalArgument("file", "The file to parse");

// Process the command line arguments
parser.process(app);

// Get positional arguments
const QStringList args = parser.positionalArguments();

// Check if a file path was provided
if (args.isEmpty()) {
qCritical() << "Error: A file path must be provided as an argument";
parser.showHelp(1);
return 1;
}

// Get the file path
QString filePath = args.first();

// Check if the file exists
QFileInfo fileInfo(filePath);
if (!fileInfo.exists()) {
qCritical() << "Error: File" << filePath << "does not exist";
return 1;
}

auto fromEncoding = Dtk::Core::DTextEncoding::detectFileEncoding(filePath);
// Print file information
qInfo() << "Parsing file:" << filePath << "From encoding: " << fromEncoding;

try {
// Call the docparser library to parse the file
std::string content = DocParser::convertFile(filePath.toStdString());

// Convert encoding
if (fromEncoding.toLower() != "utf-8") {
auto in = QByteArray::fromStdString(content);
QByteArray out;
Dtk::Core::DTextEncoding::convertTextEncodingEx(in, out, "utf-8", fromEncoding);
content = out.toStdString();
}

// Check the parsing result
if (content.empty()) {
qWarning() << "Warning: No content was extracted from the file";
return 0;
}

// Print the parsed content
QTextStream out(stdout);
out << "\n=========== File Content ===========\n\n";
out << QString::fromStdString(content) << "\n";
out << "\n=================================\n";

// Print statistics
qInfo() << "Parsing successful, extracted" << content.length() << "characters";

} catch (const std::exception &e) {
qCritical() << "An error occurred during parsing:" << e.what();
return 1;
} catch (...) {
qCritical() << "An unknown error occurred during parsing";
return 1;
}

return 0;
}
Loading