From c42ce95a597bcb03738182f77b27e55f365c6109 Mon Sep 17 00:00:00 2001
From: Andrew Volozhanin
Date: Wed, 17 Jun 2015 20:13:24 +0500
Subject: [PATCH] Add layout option to keep layout during text extraction

It passes the -layout option to pdftotext.
---
 lib/docsplit/text_extractor.rb | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 985abdd..93973f6 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -102,17 +102,26 @@ def run(command)
       result
     end

+    # Run pdftotext command
+    def run_pdftotext(pdf, text_path, options=[])
+      options << '-enc UTF-8'
+      options << '-layout' if @keep_layout
+
+      run "pdftotext #{options.join(' ')} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+    end
+
     # Extract the full contents of a pdf as a single file, directly.
     def extract_full(pdf)
       text_path = File.join(@output, "#{@pdf_name}.txt")
-      run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+      run_pdftotext pdf, text_path
     end

     # Extract the contents of a single page of text, directly, adding it to
     # the `@pages_to_ocr` list if the text length is inadequate.
     def extract_page(pdf, page)
       text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
-      run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+      run_pdftotext pdf, text_path, ["-f #{page}", "-l #{page}"]
+
       unless @forbid_ocr
         @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
       end
@@ -126,6 +135,7 @@ def extract_options(options)
       @language = options[:language] || 'eng'
       @clean_ocr = (!(options[:clean] == false) and @language == 'eng')
       @detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
+      @keep_layout = options.fetch(:layout, false)
     end

   end