Skip to content

Commit 26e6615

Browse files
author
Michal Barla
committed
Allow language parameter for tesseract text extraction
1 parent fe690ca commit 26e6615

2 files changed

Lines changed: 8 additions & 2 deletions

File tree

lib/docsplit/text_extractor.rb

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -66,15 +66,15 @@ def extract_from_ocr(pdf, pages)
6666
escaped_tiff = ESCAPE[tiff]
6767
file = "#{base_path}_#{page}"
6868
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
69-
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l eng 2>&1"
69+
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
7070
clean_text(file + '.txt') if @clean_ocr
7171
FileUtils.remove_entry_secure tiff
7272
end
7373
else
7474
tiff = "#{tempdir}/#{@pdf_name}.tif"
7575
escaped_tiff = ESCAPE[tiff]
7676
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
77-
run "tesseract #{escaped_tiff} #{base_path} -l eng 2>&1"
77+
run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
7878
clean_text(base_path + '.txt') if @clean_ocr
7979
end
8080
ensure
@@ -122,6 +122,7 @@ def extract_options(options)
122122
@force_ocr = options[:ocr] == true
123123
@forbid_ocr = options[:ocr] == false
124124
@clean_ocr = !(options[:clean] == false)
125+
@language = options[:language] || 'eng'
125126
end
126127

127128
end

test/unit/test_extract_text.rb

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,11 @@ def test_ocr_extraction
3838
end
3939
end
4040

41+
def test_ocr_extraction_in_mock_language
42+
exception = assert_raise(Docsplit::ExtractionFailed) {Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")}
43+
assert(exception.message.match("mock.traineddata"))
44+
end
45+
4146
def test_password_protected
4247
assert_raises(ExtractionFailed) do
4348
Docsplit.extract_text('test/fixtures/completely_encrypted.pdf')

0 commit comments

Comments
 (0)