First draft of OCR cleanup ... Docsplit::TextCleaner

jashkenas · jashkenas · commit 41e257a869a0 · 2010-10-18T13:09:50.000-04:00
diff --git a/lib/docsplit.rb b/lib/docsplit.rb
@@ -72,6 +72,11 @@ def self.extract_#{key}(pdfs, opts={})
     EOS
   end
 
+  # Utility method: passes OCR'd `text` to TextCleaner#clean, returns its result.
+  def self.clean_text(text)
+    TextCleaner.new.clean(text)
+  end
+
 
   private
 
@@ -103,3 +108,4 @@ def self.normalize_value(value)
 require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
 require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
 require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
+require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb
@@ -71,7 +71,7 @@ def usage
     # Use the OptionParser library to parse out all supported options. Return
     # options formatted for the Ruby API.
     def parse_options
-      @options = {:ocr => :default}
+      @options = {:ocr => :default, :clean => true}
       @option_parser = OptionParser.new do |opts|
         opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
           @options[:output] = d
@@ -88,6 +88,9 @@ def parse_options
         opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
           @options[:ocr] = o
         end
+        opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
+          @options[:clean] = false
+        end
         opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
           @options[:rolling] = true
         end
diff --git a/lib/docsplit/text_cleaner.rb b/lib/docsplit/text_cleaner.rb
@@ -0,0 +1,102 @@
+require 'strscan'
+
+module Docsplit
+
+  # Cleans up OCR'd text by using a series of heuristics to remove garbage
+  # words. Algorithms taken from:
+  #
+  #     Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
+  #       -- Taghva, Nartker, Condit, and Borsack
+  #
+  #     Improving Search and Retrieval Performance through Shortening Documents,
+  #     Detecting Garbage, and Throwing out Jargon
+  #       -- Kulp
+  #
+  class TextCleaner
+
+    # Cached regexes we plan on using.
+    WORD        = /\S+/
+    SPACE       = /\s+/
+    NEWLINE     = /[\r\n]/
+    ALNUM       = /[a-z0-9]/i
+    PUNCT       = /[^a-z0-9\s]/i
+    UPPER       = /[A-Z]/
+    LOWER       = /[a-z]/
+    ACRONYM     = /^\(?[A-Z]+('?s|[.,])?\)?$/
+    ALL_ALPHA   = /^[a-z]+$/i
+    CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i
+    VOWEL       = /([aeiou]|y$)/i
+    CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
+    VOWEL_4     = /[aeiou]{4}/i
+    REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
+    SINGLETONS  = /^[AaIi]$/
+
+    # Three or more identical characters in a row. Digits are excluded so that
+    # ordinary numbers ("2000", "$1,000") are not thrown away as garbage.
+    REPEAT      = /([^0-9])\1{2,}/
+
+    # Remove garbage words from `text` and return the cleaned string. Whitespace
+    # that follows already-emitted whitespace (i.e. after a dropped word) is
+    # skipped unless it contains a line break. Long runs of very short tokens
+    # (see REPEATED) are then stripped from the joined result.
+    #
+    # For the time being, `clean` uses the regular StringScanner, and not the
+    # multibyte-aware version.
+    def clean(text)
+      scanner = StringScanner.new(text)
+      cleaned = []
+      spaced  = false
+      until scanner.eos?
+        if space = scanner.scan(SPACE)
+          cleaned.push(space) unless spaced && (space !~ NEWLINE)
+          spaced = true
+        elsif word = scanner.scan(WORD)
+          unless garbage(word)
+            cleaned.push(word)
+            spaced = false
+          end
+        else
+          # Neither pattern matched: skip a character so the loop always
+          # advances instead of spinning forever.
+          scanner.getch
+        end
+      end
+      cleaned.join('').gsub(REPEATED, '')
+    end
+
+    # Is a given word OCR garbage? Returns a truthy value as soon as one of
+    # the heuristics below matches, and a falsy value (false or nil) otherwise.
+    def garbage(w)
+      # Longer than 20, as measured by String#length.
+      (w.length > 20) ||
+
+      # If there are three or more identical non-digit characters in a row.
+      (w =~ REPEAT) ||
+
+      # More punctuation than alpha numerics.
+      (w.scan(ALNUM).length < w.scan(PUNCT).length) ||
+
+      # Ignoring the first and last characters in the string, if there are three or
+      # more different punctuation characters in the string.
+      (w[1...-1].scan(PUNCT).uniq.length >= 3) ||
+
+      # Four or more consecutive vowels, or five or more consecutive consonants.
+      ((w =~ VOWEL_4) || (w =~ CONSONANT_5)) ||
+
+      # Number of uppercase letters greater than lowercase letters, but the word is
+      # not all uppercase + punctuation.
+      ((w.scan(UPPER).length > w.scan(LOWER).length) && (w !~ ACRONYM)) ||
+
+      # Single letters that are not A or I.
+      (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||
+
+      # All characters are alphabetic and there are 8 times more vowels than
+      # consonants, or 8 times more consonants than vowels.
+      ((w.length > 2 && (w =~ ALL_ALPHA) && (w !~ ACRONYM)) &&
+        (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
+          (cons > vows * 8)))
+    end
+
+  end
+
+end
diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
@@ -62,14 +62,17 @@ def extract_from_ocr(pdf, pages)
       if pages
         pages.each do |page|
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
+          file = "#{base_path}_#{page}"
           run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
-          run "tesseract #{tiff} #{base_path}_#{page} 2>&1"
+          run "tesseract #{tiff} #{file} 2>&1"
+          clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
         end
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
         run "tesseract #{tiff} #{base_path} -l eng 2>&1"
+        clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
       FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
@@ -78,6 +81,15 @@ def extract_from_ocr(pdf, pages)
 
     private
 
+    # Replace the contents of the OCR'd text file at `file` with the result
+    # of Docsplit.clean_text. The cleaned text is computed before the file is
+    # truncated, so an error raised while cleaning leaves the file untouched.
+    # Raises the usual Errno errors if `file` cannot be read or written.
+    def clean_text(file)
+      cleaned = Docsplit.clean_text(File.read(file))
+      File.open(file, 'w') {|f| f.write(cleaned) }
+    end
+
     # Run an external process and raise an exception if it fails.
     def run(command)
       result = `#{command}`
@@ -106,6 +118,7 @@ def extract_options(options)
       @pages      = options[:pages]
       @force_ocr  = options[:ocr] == true
       @forbid_ocr = options[:ocr] == false
+      @clean_ocr  = options[:clean]
     end
 
   end