File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 11Gem ::Specification . new do |s |
22 s . name = 'docsplit'
3- s . version = '0.3.3 ' # Keep version in sync with docsplit.rb
4- s . date = '2010-8-17 '
3+ s . version = '0.3.4 ' # Keep version in sync with docsplit.rb
4+ s . date = '2010-8-20 '
55
66 s . homepage = "http://documentcloud.github.com/docsplit/"
77 s . summary = "Break Apart Documents into Images, Text, Pages and PDFs"
Original file line number Diff line number Diff line change @@ -98,7 +98,7 @@ <h1>Doc<sub style="font-size:150%;">⚡</sub>split</h1>
9898 (title, author, number of pages...)
9999 </ p >
100100
101- < p > Docsplit is currently at < a href ="http://rubygems.org/gems/docsplit "> version 0.3.3 </ a > .</ p >
101+ < p > Docsplit is currently at < a href ="http://rubygems.org/gems/docsplit "> version 0.3.4 </ a > .</ p >
102102
103103 < p >
104104 < i > Docsplit is an open-source component of < a href ="http://documentcloud.org/ "> DocumentCloud</ a > .</ i >
Original file line number Diff line number Diff line change 11# The Docsplit module delegates to the Java PDF extractors.
22module Docsplit
33
4- VERSION = '0.3.3 ' # Keep in sync with gemspec.
4+ VERSION = '0.3.4 ' # Keep in sync with gemspec.
55
66 ROOT = File . expand_path ( File . dirname ( __FILE__ ) + '/..' )
77
Original file line number Diff line number Diff line change @@ -5,7 +5,7 @@ module Docsplit
55 class ImageExtractor
66
77 DENSITY_ARG = "-density 150"
8- MEMORY_ARGS = "-limit memory 128MiB -limit map 256MiB "
8+ MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB "
99 DEFAULT_FORMAT = :png
1010
1111 # Extract a list of PDFs as rasterized page images, according to the
Original file line number Diff line number Diff line change @@ -17,13 +17,12 @@ class TextExtractor
1717 NO_TEXT_DETECTED = /---------\n \Z /
1818
1919 OCR_FLAGS = '-density 200x200 -colorspace GRAY'
20- MEMORY_ARGS = '-limit memory 128MiB -limit map 256MiB '
20+ MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB '
2121
2222 MIN_TEXT_PER_PAGE = 100 # in bytes
2323
2424 def initialize
25- @tiffs_generated = false
26- @pages_to_ocr = [ ]
25+ @pages_to_ocr = [ ]
2726 end
2827
2928 # Extract text from a list of PDFs.
@@ -61,10 +60,11 @@ def extract_from_ocr(pdf, pages)
6160 tempdir = Dir . mktmpdir
6261 base_path = File . join ( @output , @pdf_name )
6362 if pages
64- run "MAGICK_TMPDIR=#{ tempdir } OMP_NUM_THREADS=2 gm convert +adjoin #{ MEMORY_ARGS } #{ OCR_FLAGS } #{ pdf } #{ tempdir } /#{ @pdf_name } _%d.tif 2>&1" unless @tiffs_generated
65- @tiffs_generated = true
6663 pages . each do |page |
67- run "tesseract #{ tempdir } /#{ @pdf_name } _#{ page - 1 } .tif #{ base_path } _#{ page } 2>&1"
64+ tiff = "#{ tempdir } /#{ @pdf_name } _#{ page } .tif"
65+ run "MAGICK_TMPDIR=#{ tempdir } OMP_NUM_THREADS=2 gm convert +adjoin #{ MEMORY_ARGS } #{ OCR_FLAGS } #{ pdf } [#{ page - 1 } ] #{ tiff } 2>&1"
66+ run "tesseract #{ tiff } #{ base_path } _#{ page } 2>&1"
67+ FileUtils . remove_entry_secure tiff
6868 end
6969 else
7070 tiff = "#{ tempdir } /#{ @pdf_name } .tif"
You can’t perform that action at this time.
0 commit comments