Skip to content

Commit 41e257a

Browse files
committed
First draft of OCR cleanup ... Docsplit::TextCleaner
1 parent e2d8c2d commit 41e257a

4 files changed

Lines changed: 114 additions & 2 deletions

File tree

lib/docsplit.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,11 @@ def self.extract_#{key}(pdfs, opts={})
7272
EOS
7373
end
7474

75+
# Utility method to clean OCR'd text with garbage characters. Hands the
# string to a freshly built TextCleaner and returns whatever its `clean`
# method produces.
def self.clean_text(text)
  cleaner = TextCleaner.new
  cleaner.clean(text)
end
79+
7580

7681
private
7782

@@ -103,3 +108,4 @@ def self.normalize_value(value)
103108
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
104109
require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
105110
require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
111+
require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"

lib/docsplit/command_line.rb

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def usage
7171
# Use the OptionParser library to parse out all supported options. Return
7272
# options formatted for the Ruby API.
7373
def parse_options
74-
@options = {:ocr => :default}
74+
@options = {:ocr => :default, :clean => true}
7575
@option_parser = OptionParser.new do |opts|
7676
opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
7777
@options[:output] = d
@@ -88,6 +88,9 @@ def parse_options
8888
opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
8989
@options[:ocr] = o
9090
end
91+
opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
92+
@options[:clean] = false
93+
end
9194
opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
9295
@options[:rolling] = true
9396
end

lib/docsplit/text_cleaner.rb

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
require 'strscan'
2+
3+
module Docsplit

  # Cleans up OCR'd text by using a series of heuristics to remove garbage
  # words. Algorithms taken from:
  #
  #   Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
  #     -- Taghva, Nartker, Condit, and Borsack
  #
  #   Improving Search and Retrieval Performance through Shortening Documents,
  #   Detecting Garbage, and Throwing out Jargon
  #     -- Kulp
  #
  class TextCleaner

    # Cached regexes we plan on using.
    WORD        = /\S+/
    SPACE       = /\s+/
    NEWLINE     = /[\r\n]/
    ALNUM       = /[a-z0-9]/i
    PUNCT       = /[^a-z0-9\s]/i
    REPEAT      = /(.)\1{2,}/
    UPPER       = /[A-Z]/
    LOWER       = /[a-z]/
    ACRONYM     = /^\(?[A-Z]+('?s|[.,])?\)?$/
    ALL_ALPHA   = /^[a-z]+$/i
    CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i
    VOWEL       = /([aeiou]|y$)/i
    CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
    VOWEL_4     = /[aeiou]{4}/i
    REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
    SINGLETONS  = /^[AaIi]$/

    # Returns a copy of `text` with every whitespace-delimited word that
    # `garbage` flags removed. A final pass strips anything matching REPEATED
    # (long runs of very short tokens).
    #
    # For the time being, `clean` uses the regular StringScanner, and not the
    # multibyte-aware version.
    def clean(text)
      scanner = StringScanner.new(text)
      cleaned = []
      spaced  = false

      # SPACE and WORD are complementary, so each pass consumes either a run
      # of whitespace or a run of non-whitespace and the scanner always
      # advances toward the end of the string.
      until scanner.eos?
        if (space = scanner.scan(SPACE))
          # `spaced` is still set only when a garbage word was dropped since
          # the last whitespace we saw; in that case skip this run so that we
          # don't emit doubled spaces -- but always keep line breaks.
          cleaned.push(space) unless spaced && (space !~ NEWLINE)
          spaced = true
        elsif (word = scanner.scan(WORD))
          unless garbage(word)
            cleaned.push(word)
            spaced = false
          end
        end
      end

      cleaned.join('').gsub(REPEATED, '')
    end

    # Is a given word OCR garbage? Returns a truthy value when any of the
    # heuristics below fires, and a falsy value (false or nil) otherwise.
    # An empty string is never considered garbage.
    def garbage(w)
      # Slicing off the first and last characters yields nil for an empty
      # string; fall back to '' so the punctuation check below cannot raise.
      interior = w[1...-1] || ''

      # Longer than 20, as measured by String#length.
      (w.length > 20) ||

      # If there are three or more identical characters in a row in the string.
      (w =~ REPEAT) ||

      # More punctuation than alpha numerics.
      (w.scan(ALNUM).length < w.scan(PUNCT).length) ||

      # Ignoring the first and last characters in the string, if there are three or
      # more different punctuation characters in the string.
      (interior.scan(PUNCT).uniq.length >= 3) ||

      # Four or more consecutive vowels, or five or more consecutive consonants.
      ((w =~ VOWEL_4) || (w =~ CONSONANT_5)) ||

      # Number of uppercase letters greater than lowercase letters, but the word is
      # not all uppercase + punctuation.
      ((w.scan(UPPER).length > w.scan(LOWER).length) && (w !~ ACRONYM)) ||

      # Single letters that are not A or I.
      (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||

      # All characters are alphabetic and there are 8 times more vowels than
      # consonants, or 8 times more consonants than vowels.
      ((w.length > 2 && (w =~ ALL_ALPHA) && (w !~ ACRONYM)) && lopsided?(w))
    end

    private

    # True when the vowel count exceeds eight times the consonant count, or
    # the consonant count exceeds eight times the vowel count.
    def lopsided?(w)
      vowels     = w.scan(VOWEL).length
      consonants = w.scan(CONSONANT).length
      (vowels > consonants * 8) || (consonants > vowels * 8)
    end

  end

end

lib/docsplit/text_extractor.rb

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,14 +62,17 @@ def extract_from_ocr(pdf, pages)
6262
if pages
6363
pages.each do |page|
6464
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
65+
file = "#{base_path}_#{page}"
6566
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
66-
run "tesseract #{tiff} #{base_path}_#{page} 2>&1"
67+
run "tesseract #{tiff} #{file} 2>&1"
68+
clean_text(file + '.txt') if @clean_ocr
6769
FileUtils.remove_entry_secure tiff
6870
end
6971
else
7072
tiff = "#{tempdir}/#{@pdf_name}.tif"
7173
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
7274
run "tesseract #{tiff} #{base_path} -l eng 2>&1"
75+
clean_text(base_path + '.txt') if @clean_ocr
7376
end
7477
ensure
7578
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
@@ -78,6 +81,15 @@ def extract_from_ocr(pdf, pages)
7881

7982
private
8083

84+
# Rewrite `file` in place with its contents run through Docsplit.clean_text.
#
# The cleaned text is computed *before* the file is emptied, so that if
# cleaning raises, the original contents are left untouched on disk instead
# of being lost to an already-truncated file.
def clean_text(file)
  File.open(file, 'r+') do |f|
    cleaned = Docsplit.clean_text(f.read)
    # Go back to the start and drop the old contents before writing, so a
    # shorter cleaned text does not leave stale bytes at the end.
    f.rewind
    f.truncate(0)
    f.write(cleaned)
  end
end
92+
8193
# Run an external process and raise an exception if it fails.
8294
def run(command)
8395
result = `#{command}`
@@ -106,6 +118,7 @@ def extract_options(options)
106118
@pages = options[:pages]
107119
@force_ocr = options[:ocr] == true
108120
@forbid_ocr = options[:ocr] == false
121+
@clean_ocr = options[:clean]
109122
end
110123

111124
end

0 commit comments

Comments
 (0)