-
Notifications
You must be signed in to change notification settings - Fork 214
Expand file tree
/
Copy pathtest_extract_text.rb
More file actions
executable file
·66 lines (55 loc) · 2.81 KB
/
test_extract_text.rb
File metadata and controls
executable file
·66 lines (55 loc) · 2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
here = File.expand_path(File.dirname(__FILE__))
require File.join(here, '..', 'test_helper')
require 'tmpdir'
# Exercises Docsplit.extract_text against the PDF fixtures in test/fixtures.
# Extracted text files are written to OUTPUT, a constant defined outside this
# file (presumably in test_helper -- confirm there).
class ExtractTextTest < Test::Unit::TestCase
  # Extracting every page of obama_arts.pdf is expected to leave exactly two
  # text files, the first of which contains a known phrase.
  def test_paged_extraction
    Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 'all', :output => OUTPUT)
    assert_equal 2, Dir["#{OUTPUT}/*.txt"].length
    assert_match(/Paid for by Obama for America/, File.read("#{OUTPUT}/obama_arts_1.txt"))
  end

  # Requesting only page 2 must produce that page's text file and nothing else.
  def test_page_only_extraction
    Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 2..2, :output => OUTPUT)
    assert_equal ["#{OUTPUT}/obama_arts_2.txt"], Dir["#{OUTPUT}/*.txt"]
  end

  # An upper-case file name and extension (OBAMA_ARTS.PDF) must be handled, and
  # the output file keeps the upper-case base name.
  def test_capitalized_pdf_extraction
    # Start from a clean slate so the exact-match assertion below only sees
    # files written by this test.
    FileUtils.rm(Dir["#{OUTPUT}/*.txt"])

    Dir.mktmpdir do |dir|
      capitalized = "#{dir}/OBAMA_ARTS.PDF"
      FileUtils.cp('test/fixtures/obama_arts.pdf', capitalized)
      Docsplit.extract_text(capitalized, :pages => 2..2, :output => OUTPUT)
    end

    assert_equal ["#{OUTPUT}/OBAMA_ARTS_2.txt"], Dir["#{OUTPUT}/*.txt"]
  end

  # Extracting every page of unicode.pdf is expected to leave three text files.
  def test_unicode_extraction
    Docsplit.extract_text('test/fixtures/unicode.pdf', :pages => 'all', :output => OUTPUT)
    assert_equal 3, Dir["#{OUTPUT}/*.txt"].length
  end

  # corrosion.pdf must yield a non-trivial text file for each of pages 1-4.
  # NOTE(review): the test name implies this fixture goes through the OCR
  # path -- confirm against the fixture and Docsplit's extraction logic.
  def test_ocr_extraction
    Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT)
    (1..4).each do |page|
      file = "corrosion_#{page}.txt"
      # Pass an Array, matching the other assert_directory_contains call site
      # in this class.
      assert_directory_contains(OUTPUT, [file])
      assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with extracted text should have reasonable size"
    end
  end

  # Asking for the language "mock" must raise Docsplit::ExtractionFailed with a
  # message that mentions the tessdata/mock path.
  def test_ocr_extraction_in_mock_language
    exception = assert_raise(Docsplit::ExtractionFailed) do
      Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")
    end
    assert_match(%r{tessdata/mock}, exception.message, "Expected problem with loading data for language 'mock'")
  end

  # Extracting from completely_encrypted.pdf must raise
  # Docsplit::ExtractionFailed. The constant is fully qualified so the test
  # does not depend on Docsplit being mixed into the test case.
  def test_password_protected
    assert_raise(Docsplit::ExtractionFailed) do
      Docsplit.extract_text('test/fixtures/completely_encrypted.pdf')
    end
  end

  # A file name containing spaces, single quotes and double quotes must still
  # be extracted; two text files are expected.
  def test_name_escaping_while_extracting_text
    Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT)
    assert_equal 2, Dir["#{OUTPUT}/*.txt"].length
  end

  # With :leading_zeros => true the page numbers in the output file names are
  # zero-padded to two digits (01 through 10).
  def test_leading_zeros_while_extracting_text
    Docsplit.extract_text('test/fixtures/leading_zeros.pdf', :pages => 'all', :leading_zeros => true, :output => OUTPUT)
    expected = (1..10).map { |page| format('leading_zeros_%02d.txt', page) }
    assert_directory_contains(OUTPUT, expected)
  end
end