-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdf_ocr_replicate.py
More file actions
executable file
·103 lines (90 loc) · 4.01 KB
/
pdf_ocr_replicate.py
File metadata and controls
executable file
·103 lines (90 loc) · 4.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env -S uv run
# /// script
# dependencies = [
# "requests>=2.31",
# "replicate>=0.25",
# ]
# requires-python = ">=3.11"
# ///
import argparse
import os
import sys
import json
import requests
import replicate
def available_ocr_models():
    """Return the names of the OCR models this script knows how to run.

    A new list is built on every call, so callers may modify the result
    without affecting later calls.
    """
    supported = [
        "cuuupid-marker",
        "cudanexus-nougat",
        "awilliamson10-meta-nougat",
    ]
    return supported
def generate_ocr(input_file, output_file, model):
    """Run one Replicate-hosted OCR model on a PDF and save the results.

    The raw model response is dumped as JSON next to ``output_file`` (same
    path with its extension replaced by ``.json``). If the response is a
    mapping containing a ``markdown`` entry, that URL is downloaded and its
    text is written to ``output_file``; otherwise a notice is printed and no
    Markdown file is created.

    Args:
        input_file: Path of the PDF to process. For the
            ``awilliamson10-meta-nougat`` model the value is passed through
            unchanged as ``pdf_link``.
        output_file: Path of the Markdown file to write.
        model: One of the names returned by ``available_ocr_models()``.

    Raises:
        ValueError: If ``model`` is not a supported model name.
        requests.HTTPError: If the Markdown download returns an error status.
    """
    # The PDF handles are opened in ``with`` blocks so they are closed as soon
    # as the (blocking) replicate.run() call returns, instead of leaking.
    if model == "cuuupid-marker":
        with open(input_file, "rb") as document:
            output = replicate.run(
                "cuuupid/marker:9c67051309f6d10ca139489f15fcb5ebc4866a3734af537c181fb13bc719d280",
                input={
                    "dpi": 400,
                    "lang": "English",
                    "document": document,
                    "enable_editor": False,
                    "parallel_factor": 10,
                },
            )
    elif model == "cudanexus-nougat":
        with open(input_file, "rb") as pdf_file:
            output = replicate.run(
                "cudanexus/nougat:d0b4e90da423598ff84debc9115bf891dd819843600ad842c0c178e3571f9e76",
                input={"pdf_file": pdf_file},
            )
    elif model == "awilliamson10-meta-nougat":
        # NOTE(review): the parameter is named "pdf_link", so this model
        # presumably expects a URL rather than a local path - confirm against
        # the model's input schema.
        output = replicate.run(
            "awilliamson10/meta-nougat:872fa99400b0eeb8bfc82ef433aa378976b4311178ff64fed439470249902071",
            input={"pdf_link": input_file},
        )
    else:
        raise ValueError(f"Unsupported model: {model}")

    # splitext only strips an extension from the final path component, so a
    # dot in a directory name no longer truncates the path.
    json_output_file = os.path.splitext(output_file)[0] + '.json'
    with open(json_output_file, "w", encoding="utf-8") as f:
        # default=str keeps the dump from failing on values json cannot encode
        # natively. NOTE(review): presumably relevant for file-like output
        # objects returned by newer replicate clients - confirm.
        json.dump(output, f, default=str)

    # Only a mapping can carry a 'markdown' entry; other response shapes
    # (e.g. a bare string) are reported instead of raising AttributeError.
    markdown_url = output.get('markdown') if isinstance(output, dict) else None
    if markdown_url:
        # A timeout stops a stalled download from hanging the script forever.
        response = requests.get(str(markdown_url), timeout=60)
        response.raise_for_status()
        # Explicit UTF-8 so non-ASCII Markdown does not depend on the locale.
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(response.text)
    else:
        print("No markdown URL found in the server response.")
def main():
    """Command-line entry point: OCR a PDF into Markdown via Replicate.

    Parses the arguments, validates the input file and optional output
    directory, then either runs the single selected model or, with ``--all``,
    every model from ``available_ocr_models()``.

    In single-model mode the user is asked once before existing output is
    overwritten. In ``--all`` mode each model writes to
    ``<base>.<model>.md`` and models whose output already exists are skipped.

    Exits with status 1 on invalid input paths and 0 if the user declines to
    overwrite.
    """
    models = available_ocr_models()

    parser = argparse.ArgumentParser(description="OCR PDF to Markdown converter")
    parser.add_argument("input_file", help="Input PDF file")
    parser.add_argument("-o", "--output", help="Custom output filename")
    # argparse does not validate the default against `choices`; the previous
    # default "marker" is not a supported name and made every run without -m
    # fail with "Unsupported model: marker".
    parser.add_argument("-m", "--model", choices=models, default="cuuupid-marker", help="OCR model to use")
    parser.add_argument("--all", action='store_true', help="Run processing through all available OCR models")
    parser.add_argument("-D", "--output-dir", help="Output directory")
    args = parser.parse_args()

    if not os.path.exists(args.input_file):
        print(f"Error: Input file '{args.input_file}' does not exist.")
        sys.exit(1)

    output_dir = args.output_dir if args.output_dir else ""
    if output_dir and not os.path.isdir(output_dir):
        print(f"Error: Output directory '{output_dir}' does not exist or is not a directory.")
        sys.exit(1)

    default_name = os.path.splitext(os.path.basename(args.input_file))[0] + ".md"
    output_file = os.path.join(output_dir, args.output if args.output else default_name)
    # Path without the extension; the JSON dump written by generate_ocr() and
    # the per-model files in --all mode are derived from it.
    output_base = os.path.splitext(output_file)[0]

    if args.all:
        for model in models:
            # output_base already contains output_dir, so it must not be
            # joined onto it a second time.
            model_output_file = f"{output_base}.{model}.md"
            # generate_ocr() writes its JSON dump to "<path minus ext>.json".
            model_json_output_file = os.path.splitext(model_output_file)[0] + '.json'
            if os.path.exists(model_output_file) or os.path.exists(model_json_output_file):
                print(f"Warning: Output file '{model_output_file}' or '{model_json_output_file}' already exists. Skipping model {model}.")
                continue
            generate_ocr(args.input_file, model_output_file, model)
    else:
        # Ask once, and only in single-model mode: these are the two files
        # this run would overwrite (--all never writes them).
        json_output_file = output_base + '.json'
        if os.path.exists(output_file) or os.path.exists(json_output_file):
            response = input(f"Warning: Output file '{output_file}' or '{json_output_file}' already exists. Overwrite? (y/n): ")
            if response.strip().lower() != 'y':
                print("Operation cancelled.")
                sys.exit(0)
        generate_ocr(args.input_file, output_file, args.model)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()