From c42ce95a597bcb03738182f77b27e55f365c6109 Mon Sep 17 00:00:00 2001
From: Andrew Volozhanin <linuxheadrus@gmail.com>
Date: Wed, 17 Jun 2015 20:13:24 +0500
Subject: [PATCH] Add layout option to keep layout during text extraction

It passes the -layout option to pdftotext.
---
 lib/docsplit/text_extractor.rb | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 985abdd..93973f6 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -102,17 +102,26 @@ def run(command)
       result
     end
 
+    # Run pdftotext on `pdf`, writing to `text_path`; `options` are extra CLI flags.
+    def run_pdftotext(pdf, text_path, options=[])
+      # Copy instead of appending in place, so the caller's array is not mutated.
+      flags = options + ['-enc UTF-8']
+      flags << '-layout' if @keep_layout
+      run "pdftotext #{flags.join(' ')} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+    end
+
     # Extract the full contents of a pdf as a single file, directly.
     def extract_full(pdf)
       text_path = File.join(@output, "#{@pdf_name}.txt")
-      run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+      run_pdftotext(pdf, text_path)
     end
 
     # Extract the contents of a single page of text, directly, adding it to
     # the `@pages_to_ocr` list if the text length is inadequate.
     def extract_page(pdf, page)
       text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
-      run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+      run_pdftotext pdf, text_path, ["-f #{page}", "-l #{page}"]
+
       unless @forbid_ocr
         @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
       end
@@ -126,6 +135,7 @@ def extract_options(options)
       @language           = options[:language] || 'eng'
       @clean_ocr          = (!(options[:clean] == false) and @language == 'eng')
       @detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
+      @keep_layout        = options.fetch(:layout, false)
     end
 
   end