Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ pkg_check_modules(DEPS REQUIRED
libxml-2.0
uuid
tinyxml2
libmagic
)

# 添加子目录
Expand Down
12 changes: 12 additions & 0 deletions debian/changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
docparser (1.0.19) unstable; urgency=medium

* integrate libmagic for MIME type detection in docparser

-- Zhang Sheng <zhangsheng@uniontech.com> Mon, 26 May 2025 14:21:27 +0800

docparser (1.0.18) unstable; urgency=medium

* refactor

-- Zhang Sheng <zhangsheng@uniontech.com> Thu, 22 May 2025 08:28:35 +0800

docparser (1.0.17) unstable; urgency=medium

* add tests
Expand Down
3 changes: 2 additions & 1 deletion debian/control
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ Build-Depends:
libfreetype6-dev|libfreetype-dev,
libxml2-dev,
uuid-dev,
libtinyxml2-dev
libtinyxml2-dev,
libmagic-dev
Standards-Version: 4.3.0

Package: libdocparser
Expand Down
72 changes: 71 additions & 1 deletion src/docparser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@
#include <iostream>
#include <cstring>
#include <unordered_set>
#include <unordered_map>

Check warning on line 23 in src/docparser.cpp

View workflow job for this annotation

GitHub Actions / cppcheck

Include file: <unordered_map> not found. Please note: Cppcheck does not need standard library headers to get proper results.
#include <string_view>

Check warning on line 24 in src/docparser.cpp

View workflow job for this annotation

GitHub Actions / cppcheck

Include file: <string_view> not found. Please note: Cppcheck does not need standard library headers to get proper results.
#include <algorithm>

Check warning on line 25 in src/docparser.cpp

View workflow job for this annotation

GitHub Actions / cppcheck

Include file: <algorithm> not found. Please note: Cppcheck does not need standard library headers to get proper results.
#include <magic.h>

Check warning on line 26 in src/docparser.cpp

View workflow job for this annotation

GitHub Actions / cppcheck

Include file: <magic.h> not found. Please note: Cppcheck does not need standard library headers to get proper results.

static bool isTextSuffix(std::string_view suffix)
{
Expand All @@ -39,6 +40,65 @@
return validSuffixes.count(lowercaseSuffix) > 0;
}

/**
* @brief Check if a file is a text file using libmagic MIME type detection
* @param filename The path to the file to check
* @return true if the file is detected as text, false otherwise
*/
static bool isTextFileByMimeType(const std::string &filename)
{
magic_t magic_cookie = magic_open(MAGIC_MIME_TYPE);
if (magic_cookie == nullptr) {
std::cerr << "ERROR: [isTextFileByMimeType] Failed to initialize libmagic" << std::endl;
return false;
}

if (magic_load(magic_cookie, nullptr) != 0) {
std::cerr << "ERROR: [isTextFileByMimeType] Failed to load magic database: "
<< magic_error(magic_cookie) << std::endl;
magic_close(magic_cookie);
return false;
}

const char *mime_type = magic_file(magic_cookie, filename.c_str());
if (mime_type == nullptr) {
std::cerr << "ERROR: [isTextFileByMimeType] Failed to detect MIME type for "
<< filename << ": " << magic_error(magic_cookie) << std::endl;
magic_close(magic_cookie);
return false;
}

std::string mimeStr(mime_type);
magic_close(magic_cookie);

std::cout << "INFO: [isTextFileByMimeType] Detected MIME type: " << mimeStr
<< " for file: " << filename << std::endl;

// Check if MIME type starts with "text/"
bool isText = mimeStr.substr(0, 5) == "text/";

// Also consider some application types that are actually text
if (!isText) {
static const std::unordered_set<std::string> textApplicationTypes = {
"application/json",
"application/xml",
"application/javascript",
"application/x-sh",
"application/x-shellscript",
"application/x-perl",
"application/x-python",
"application/x-ruby",
"application/x-awk",
"application/x-desktop",
"application/x-yaml"
};

isText = textApplicationTypes.count(mimeStr) > 0;
}

return isText;
}

// 预处理后缀映射,避免多次strcasecmp比较
using FileCreator = std::unique_ptr<fileext::FileExtension> (*)(const std::string &, const std::string &);

Expand Down Expand Up @@ -157,7 +217,17 @@
if (it != extensionMap.end()) {
document = it->second(filename, suffix);
} else {
throw std::logic_error("Unsupported file extension: " + filename);
// Extension not found in map, check if it's a text file by content
std::cout << "INFO: [doConvertFile] Unknown file extension '" << suffix
<< "', checking file content for text type: " << filename << std::endl;

if (isTextFileByMimeType(filename)) {
std::cout << "INFO: [doConvertFile] File detected as text by MIME type analysis: "
<< filename << std::endl;
document = createTxt(filename, suffix);
} else {
throw std::logic_error("Unsupported file extension: " + filename);
}
}
}

Expand Down
40 changes: 38 additions & 2 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,26 @@
find_package(Qt6 REQUIRED COMPONENTS Core)
set(CMAKE_AUTOMOC ON)
find_package(Qt6 REQUIRED COMPONENTS Core Test)
find_package(Dtk6 REQUIRED COMPONENTS Core)

# Enable testing
enable_testing()

# 设置测试程序的源文件
set(TEST_SOURCES
main.cpp
)

# 设置自动化测试的源文件
set(AUTOTEST_SOURCES
autotest.cpp
)

# 创建测试可执行文件
add_executable(docparser_test ${TEST_SOURCES})

# 创建自动化测试可执行文件
add_executable(docparser_autotest ${AUTOTEST_SOURCES})

# 链接docparser库和Qt6Core
target_link_libraries(docparser_test
PRIVATE
Expand All @@ -17,13 +29,37 @@ target_link_libraries(docparser_test
Dtk6::Core
)

# 链接自动化测试所需的库
target_link_libraries(docparser_autotest
PRIVATE
docparser
Qt6::Core
Qt6::Test
)

# 设置包含目录
target_include_directories(docparser_test
PRIVATE
${CMAKE_SOURCE_DIR}/src
)

target_include_directories(docparser_autotest
PRIVATE
${CMAKE_SOURCE_DIR}/src
)

# 添加测试到CTest
add_test(
NAME DocParserAutoTest
COMMAND docparser_autotest
)

# 设置测试属性
set_tests_properties(DocParserAutoTest PROPERTIES
TIMEOUT 300 # 5 minutes timeout for all tests
)

# 安装测试程序(可选)
install(TARGETS docparser_test
install(TARGETS docparser_test docparser_autotest
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
)
Loading
Loading