|
| 1 | +require 'strscan' |
| 2 | + |
module Docsplit

  # Cleans up OCR'd text by using a series of heuristics to remove garbage
  # words. Algorithms taken from:
  #
  #   Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
  #     -- Taghva, Nartker, Condit, and Borsack
  #
  #   Improving Search and Retrieval Performance through Shortening Documents,
  #   Detecting Garbage, and Throwing out Jargon
  #     -- Kulp
  #
  class TextCleaner

    # Cached regexes we plan on using.
    WORD        = /\S+/                     # a run of non-whitespace
    SPACE       = /\s+/                     # a run of whitespace
    NEWLINE     = /[\r\n]/
    ALNUM       = /[a-z0-9]/i
    PUNCT       = /[^a-z0-9\s]/i            # anything that is neither alphanumeric nor whitespace
    REPEAT      = /(.)\1{2,}/               # the same character three or more times in a row
    UPPER       = /[A-Z]/
    LOWER       = /[a-z]/
    ACRONYM     = /^\(?[A-Z]+('?s|[.,])?\)?$/
    ALL_ALPHA   = /^[a-z]+$/i
    CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i   # a leading "y" counts as a consonant
    VOWEL       = /([aeiou]|y$)/i                  # a trailing "y" counts as a vowel
    CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
    VOWEL_4     = /[aeiou]{4}/i
    # A token of 1-2 characters, then five or more tokens of 1-3 characters,
    # then another token of 1-2 characters, each followed by whitespace.
    REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
    SINGLETONS  = /^[AaIi]$/

    # Returns a new string built from `text` with every word that `garbage`
    # flags removed, and with every run matching REPEATED stripped from the
    # result.
    #
    # Whitespace runs are copied through as-is, with one exception: a run that
    # directly follows another whitespace run (which only happens when the
    # word between them was dropped) is kept only if it contains a newline.
    #
    # For the time being, `clean` uses the regular StringScanner, and not the
    # multibyte-aware version.
    def clean(text)
      scanner = StringScanner.new(text)
      cleaned = []
      spaced  = false # true while the most recently handled kept token was whitespace

      until scanner.eos?
        if (space = scanner.scan(SPACE))
          cleaned.push(space) unless spaced && (space !~ NEWLINE)
          spaced = true
        elsif (word = scanner.scan(WORD))
          # A dropped word leaves `spaced` untouched, so the whitespace after
          # it is collapsed into the whitespace before it.
          unless garbage(word)
            cleaned.push(word)
            spaced = false
          end
        end
      end

      cleaned.join('').gsub(REPEATED, '')
    end

    # Is a given word OCR garbage?
    #
    # `w` is a String. The result is truthy when any heuristic below fires and
    # falsy (`nil` or `false`) otherwise; it is not guaranteed to be exactly
    # `true`, because some heuristics return a regex match index.
    # An empty string is never considered garbage.
    def garbage(w)
      # Longer than 20, as measured by String#length.
      (w.length > 20) ||

        # If there are three or more identical characters in a row in the string.
        (w =~ REPEAT) ||

        # More punctuation than alpha numerics.
        (w.scan(ALNUM).length < w.scan(PUNCT).length) ||

        # Ignoring the first and last characters in the string, if there are three or
        # more different punctuation characters in the string.
        # `w[1...-1]` is nil for an empty string, hence the fallback to ''.
        ((w[1...-1] || '').scan(PUNCT).uniq.length >= 3) ||

        # Four or more consecutive vowels, or five or more consecutive consonants.
        ((w =~ VOWEL_4) || (w =~ CONSONANT_5)) ||

        # Number of uppercase letters greater than lowercase letters, but the word is
        # not all uppercase + punctuation.
        ((w.scan(UPPER).length > w.scan(LOWER).length) && (w !~ ACRONYM)) ||

        # Single letters that are not A or I.
        (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||

        # All characters are alphabetic and there are 8 times more vowels than
        # consonants, or 8 times more consonants than vowels.
        ((w.length > 2 && (w =~ ALL_ALPHA) && (w !~ ACRONYM)) &&
         (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
          (cons > vows * 8)))
    end

  end

end
0 commit comments