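Further tweaks to image/text extraction: raise the GraphicsMagick memory limits, convert and OCR pages one at a time (removing each TIFF after use), and bump the version to 0.3.4.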

jashkenas · jashkenas · commit 5efc6761540f · 2010-08-20T16:59:22.000-04:00
diff --git a/docsplit.gemspec b/docsplit.gemspec
@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'docsplit'
-  s.version   = '0.3.3'         # Keep version in sync with docsplit.rb
-  s.date      = '2010-8-17'
+  s.version   = '0.3.4'         # Keep version in sync with docsplit.rb
+  s.date      = '2010-8-20'
 
   s.homepage    = "http://documentcloud.github.com/docsplit/"
   s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
diff --git a/index.html b/index.html
@@ -98,7 +98,7 @@ <h1>Doc<sub style="font-size:150%;">&#9889;</sub>split</h1>
       (title, author, number of pages...)
     </p>
 
-    <p>Docsplit is currently at <a href="http://rubygems.org/gems/docsplit">version 0.3.3</a>.</p>
+    <p>Docsplit is currently at <a href="http://rubygems.org/gems/docsplit">version 0.3.4</a>.</p>
 
     <p>
       <i>Docsplit is an open-source component of <a href="http://documentcloud.org/">DocumentCloud</a>.</i>
diff --git a/lib/docsplit.rb b/lib/docsplit.rb
@@ -1,7 +1,7 @@
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
 
-  VERSION       = '0.3.3' # Keep in sync with gemspec.
+  VERSION       = '0.3.4' # Keep in sync with gemspec.
 
   ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')
 
diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
@@ -5,7 +5,7 @@ module Docsplit
   class ImageExtractor
 
     DENSITY_ARG     = "-density 150"
-    MEMORY_ARGS     = "-limit memory 128MiB -limit map 256MiB"
+    MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
     DEFAULT_FORMAT  = :png
 
     # Extract a list of PDFs as rasterized page images, according to the
diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
@@ -17,13 +17,12 @@ class TextExtractor
     NO_TEXT_DETECTED = /---------\n\Z/
 
     OCR_FLAGS   = '-density 200x200 -colorspace GRAY'
-    MEMORY_ARGS = '-limit memory 128MiB -limit map 256MiB'
+    MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
 
     MIN_TEXT_PER_PAGE = 100 # in bytes
 
     def initialize
-      @tiffs_generated = false
-      @pages_to_ocr    = []
+      @pages_to_ocr = []
     end
 
     # Extract text from a list of PDFs.
@@ -61,10 +60,11 @@ def extract_from_ocr(pdf, pages)
       tempdir = Dir.mktmpdir
       base_path = File.join(@output, @pdf_name)
       if pages
-        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
-        @tiffs_generated = true
         pages.each do |page|
-          run "tesseract #{tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
+          tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
+          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
+          run "tesseract #{tiff} #{base_path}_#{page} 2>&1"
+          FileUtils.remove_entry_secure tiff
         end
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"