Skip to content

Commit 5efc676

Browse files
committed
further tweaks to image/text extraction.
1 parent 8901441 commit 5efc676

5 files changed

Lines changed: 11 additions & 11 deletions

File tree

docsplit.gemspec

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
Gem::Specification.new do |s|
22
s.name = 'docsplit'
3-
s.version = '0.3.3' # Keep version in sync with docsplit.rb
4-
s.date = '2010-8-17'
3+
s.version = '0.3.4' # Keep version in sync with docsplit.rb
4+
s.date = '2010-8-20'
55

66
s.homepage = "http://documentcloud.github.com/docsplit/"
77
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"

index.html

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -98,7 +98,7 @@ <h1>Doc<sub style="font-size:150%;">&#9889;</sub>split</h1>
9898
(title, author, number of pages...)
9999
</p>
100100

101-
<p>Docsplit is currently at <a href="http://rubygems.org/gems/docsplit">version 0.3.3</a>.</p>
101+
<p>Docsplit is currently at <a href="http://rubygems.org/gems/docsplit">version 0.3.4</a>.</p>
102102

103103
<p>
104104
<i>Docsplit is an open-source component of <a href="http://documentcloud.org/">DocumentCloud</a>.</i>

lib/docsplit.rb

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
# The Docsplit module delegates to the Java PDF extractors.
22
module Docsplit
33

4-
VERSION = '0.3.3' # Keep in sync with gemspec.
4+
VERSION = '0.3.4' # Keep in sync with gemspec.
55

66
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
77

lib/docsplit/image_extractor.rb

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@ module Docsplit
55
class ImageExtractor
66

77
DENSITY_ARG = "-density 150"
8-
MEMORY_ARGS = "-limit memory 128MiB -limit map 256MiB"
8+
MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
99
DEFAULT_FORMAT = :png
1010

1111
# Extract a list of PDFs as rasterized page images, according to the

lib/docsplit/text_extractor.rb

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -17,13 +17,12 @@ class TextExtractor
1717
NO_TEXT_DETECTED = /---------\n\Z/
1818

1919
OCR_FLAGS = '-density 200x200 -colorspace GRAY'
20-
MEMORY_ARGS = '-limit memory 128MiB -limit map 256MiB'
20+
MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
2121

2222
MIN_TEXT_PER_PAGE = 100 # in bytes
2323

2424
def initialize
25-
@tiffs_generated = false
26-
@pages_to_ocr = []
25+
@pages_to_ocr = []
2726
end
2827

2928
# Extract text from a list of PDFs.
@@ -61,10 +60,11 @@ def extract_from_ocr(pdf, pages)
6160
tempdir = Dir.mktmpdir
6261
base_path = File.join(@output, @pdf_name)
6362
if pages
64-
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
65-
@tiffs_generated = true
6663
pages.each do |page|
67-
run "tesseract #{tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
64+
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
65+
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
66+
run "tesseract #{tiff} #{base_path}_#{page} 2>&1"
67+
FileUtils.remove_entry_secure tiff
6868
end
6969
else
7070
tiff = "#{tempdir}/#{@pdf_name}.tif"

0 commit comments

Comments
 (0)