Skip to content

Commit 085f5f6

Browse files
committed
Merge pull request #24 from minio-sk/feature-multiple_languages
Feature multiple languages
2 parents 18447cd + e1f0d16 commit 085f5f6

3 files changed

Lines changed: 12 additions & 3 deletions

File tree

lib/docsplit.rb

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@ module Docsplit
11 11

12 12
HEADLESS = "-Djava.awt.headless=true"
13 13

14-
OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : '-Doffice.home=/usr/lib/openoffice'
14+
office ||= "/usr/lib/openoffice" if File.exists? '/usr/lib/openoffice'
15+
office ||= "/usr/lib/libreoffice" if File.exists? '/usr/lib/libreoffice'
16+
17+
OFFICE = RUBY_PLATFORM.match(/darwin/i) ? '' : "-Doffice.home=#{office}"
15 18

16 19
METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
17 20

lib/docsplit/text_extractor.rb

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,15 +66,15 @@ def extract_from_ocr(pdf, pages)
66 66
escaped_tiff = ESCAPE[tiff]
67 67
file = "#{base_path}_#{page}"
68 68
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
69-
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l eng 2>&1"
69+
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
70 70
clean_text(file + '.txt') if @clean_ocr
71 71
FileUtils.remove_entry_secure tiff
72 72
end
73 73
else
74 74
tiff = "#{tempdir}/#{@pdf_name}.tif"
75 75
escaped_tiff = ESCAPE[tiff]
76 76
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
77-
run "tesseract #{escaped_tiff} #{base_path} -l eng 2>&1"
77+
run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
78 78
clean_text(base_path + '.txt') if @clean_ocr
79 79
end
80 80
ensure
@@ -122,6 +122,7 @@ def extract_options(options)
122 122
@force_ocr = options[:ocr] == true
123 123
@forbid_ocr = options[:ocr] == false
124 124
@clean_ocr = !(options[:clean] == false)
125+
@language = options[:language] || 'eng'
125 126
end
126 127

127 128
end

test/unit/test_extract_text.rb

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,11 @@ def test_ocr_extraction
38 38
end
39 39
end
40 40

41+
def test_ocr_extraction_in_mock_language
42+
exception = assert_raise(Docsplit::ExtractionFailed) {Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")}
43+
assert exception.message.match("tessdata/mock"), "Expected problem with loading data for language 'mock'"
44+
end
45+
41 46
def test_password_protected
42 47
assert_raises(ExtractionFailed) do
43 48
Docsplit.extract_text('test/fixtures/completely_encrypted.pdf')

0 commit comments

Comments
 (0)