tesseract-002
Tests that cx:tesseract produces hOCR output when output-format="hocr" is specified.
Test is expected to pass.
The pipeline
<p:declare-step name="main" version="3.0"
                xmlns:p="http://www.w3.org/ns/xproc"
                xmlns:cx="http://xmlcalabash.com/ns/extensions"
                xmlns:t="http://xproc.org/ns/testsuite/3.0">

  <!-- Extension step libraries imported for the cx:* steps used below. -->
  <p:import href="https://xmlcalabash.com/ext/library/pdf-steps.xpl"/>
  <p:import href="https://xmlcalabash.com/ext/library/tesseract.xpl"/>

  <!-- Single output port; the last step in the pipeline feeds it implicitly. -->
  <p:output port="result"/>

  <!-- Stage 1: hand the sample PDF to cx:pdf-to-images with dpi="300".
       The href is relative to this pipeline document. -->
  <cx:pdf-to-images dpi="300">
    <p:with-input port="source" href="../documents/example.pdf"/>
  </cx:pdf-to-images>

  <!-- Stage 2: run cx:tesseract on the output of stage 1 (no explicit
       binding, so the default readable port is used), asking for English
       and the hOCR output format under test.
       NOTE(review): debug-output="/dev/null" presumably discards the
       step's debug output; the path is POSIX-specific. Confirm against the
       cx:tesseract documentation. -->
  <cx:tesseract language="eng"
                output-format="hocr"
                debug-output="/dev/null"/>

</p:declare-step>
Result
<!-- Sample hOCR result: one ocr_page (page_1) containing three ocr_carea
     text blocks and one ocr_photo block. The recognized words read
     "PDF Text", "This is a sample PDF document." and "With an image.".
     The title attributes carry bounding boxes and per-word x_wconf values. -->
<html xmlns="http://www.w3.org/1999/xhtml"
xmlns:t="http://xproc.org/ns/testsuite/3.0">
<head>
<title/>
<meta http-equiv="Content-Type"
content="text/html;charset=utf-8"/>
<meta name="ocr-system" content="tesseract"/>
</head>
<body>
<div class="ocr_page" id="page_1"
title='image "unknown"; bbox 0 0 2480 3507; ppageno 0; scan_res 70 70'>
<!-- block_1_1: heading line, words "PDF" and "Text" -->
<div class="ocr_carea" id="block_1_1"
title="bbox 200 237 586 301">
<p class="ocr_par" id="par_1_1" lang="eng"
title="bbox 200 237 586 301">
<span class="ocr_line" id="line_1_1"
title="bbox 200 237 586 301; baseline 0 -1; x_size 84.166664; x_descenders 21.041666; x_ascenders 21.041666">
<span class="ocrx_word" id="word_1_1"
title="bbox 200 237 373 300; x_wconf 96">PDF</span>
<span class="ocrx_word" id="word_1_2"
title="bbox 404 237 586 301; x_wconf 96">Text</span>
</span>
</p>
</div>
<!-- block_1_2: sentence "This is a sample PDF document." -->
<div class="ocr_carea" id="block_1_2"
title="bbox 191 387 842 431">
<p class="ocr_par" id="par_1_2" lang="eng"
title="bbox 191 387 842 431">
<span class="ocr_line" id="line_1_2"
title="bbox 191 387 842 431; baseline 0 -10; x_size 44; x_descenders 10; x_ascenders 11">
<span class="ocrx_word" id="word_1_3"
title="bbox 191 387 271 421; x_wconf 96">This</span>
<span class="ocrx_word" id="word_1_4"
title="bbox 289 389 316 421; x_wconf 96">is</span>
<span class="ocrx_word" id="word_1_5"
title="bbox 347 398 352 421; x_wconf 96">a</span>
<span class="ocrx_word" id="word_1_6"
title="bbox 369 387 512 431; x_wconf 96">sample</span>
<span class="ocrx_word" id="word_1_7"
title="bbox 531 389 613 421; x_wconf 96">PDF</span>
<span class="ocrx_word" id="word_1_8"
title="bbox 629 387 842 421; x_wconf 95">document.</span>
</span>
</p>
</div>
<!-- block_1_3: region classified as ocr_photo; it has no text content -->
<div class="ocr_photo" id="block_1_3"
title="bbox 206 578 672 1056"/>
<!-- block_1_4: sentence "With an image." -->
<div class="ocr_carea" id="block_1_4"
title="bbox 190 1212 495 1257">
<p class="ocr_par" id="par_1_3" lang="eng"
title="bbox 190 1212 495 1257">
<span class="ocr_line" id="line_1_3"
title="bbox 190 1212 495 1257; baseline 0 -11; x_size 45; x_descenders 11; x_ascenders 11">
<span class="ocrx_word" id="word_1_9"
title="bbox 190 1212 283 1246; x_wconf 95">With</span>
<span class="ocrx_word"
id="word_1_10"
title="bbox 301 1223 345 1246; x_wconf 95">an</span>
<span class="ocrx_word"
id="word_1_11"
title="bbox 364 1214 495 1257; x_wconf 95">image.</span>
</span>
</p>
</div>
</div>
</body>
</html>
Schematron checks
<s:schema queryBinding="xslt2"
          xmlns:s="http://purl.oclc.org/dsdl/schematron"
          xmlns:t="http://xproc.org/ns/testsuite/3.0">

  <!-- The result is XHTML, so the XPath tests below need a prefix bound to
       the XHTML namespace; unprefixed names would not match. -->
  <s:ns uri="http://www.w3.org/1999/xhtml" prefix="h"/>

  <!-- Check 1: the document element must be an XHTML html element. -->
  <s:pattern>
    <s:rule context="/">
      <s:assert test="h:html">Wrong document element</s:assert>
    </s:rule>
  </s:pattern>

  <!-- Check 2: the body must directly contain an ocr_page div, and at least
       one ocrx_word span must have the string value 'PDF' (the "="
       comparison is true if any span in the sequence matches). -->
  <s:pattern>
    <s:rule context="/h:html">
      <s:assert test="h:body/h:div[@class='ocr_page']">Unexpected page output</s:assert>
      <s:assert test="h:body//h:span[@class='ocrx_word'] = 'PDF'">Unexpected output</s:assert>
    </s:rule>
  </s:pattern>

</s:schema>
Revision history
- 12 Jun 2026, Norm Tovey-Walsh
- Created test.