tesseract-002

Tests that cx:tesseract produces hOCR output when output-format="hocr" is requested.

Test is expected to pass.

The pipeline

<p:declare-step name="main" version="3.0"
                xmlns:p="http://www.w3.org/ns/xproc"
                xmlns:cx="http://xmlcalabash.com/ns/extensions"
                xmlns:t="http://xproc.org/ns/testsuite/3.0">
   <!-- Test pipeline: sample PDF, then cx:pdf-to-images, then cx:tesseract
        with hOCR requested; the outcome is exposed on the "result" port. -->

   <!-- Libraries imported for the cx:* steps used below (presumably one
        declares cx:pdf-to-images and the other cx:tesseract; confirm
        against the library sources). -->
   <p:import href="https://xmlcalabash.com/ext/library/pdf-steps.xpl"/>
   <p:import href="https://xmlcalabash.com/ext/library/tesseract.xpl"/>

   <!-- The only declared output port. No explicit connection is given, so
        it relies on XProc's default binding to the last step in the
        subpipeline. -->
   <p:output port="result"/>

   <!-- Stage 1: pass the sample PDF (href is relative to this pipeline) to
        cx:pdf-to-images with dpi set to 300. -->
   <cx:pdf-to-images dpi="300">
      <p:with-input port="source" href="../documents/example.pdf"/>
   </cx:pdf-to-images>

   <!-- Stage 2: run cx:tesseract with language "eng" and output-format
        "hocr". It has no explicit input, so it reads from the preceding
        step through the default connection.
        NOTE(review): debug-output points at /dev/null, which is a
        Unix-specific path; confirm how this behaves on other platforms. -->
   <cx:tesseract output-format="hocr"
                 language="eng"
                 debug-output="/dev/null"/>
</p:declare-step>

Result

<html xmlns="http://www.w3.org/1999/xhtml"
      xmlns:t="http://xproc.org/ns/testsuite/3.0">
   <head>
      <title/>
      <meta http-equiv="Content-Type"
            content="text/html;charset=utf-8"/>
      <meta name="ocr-system" content="tesseract"/>
   </head>
   <body>
      <div class="ocr_page" id="page_1"
           title='image "unknown"; bbox 0 0 2480 3507; ppageno 0; scan_res 70 70'>
         <div class="ocr_carea" id="block_1_1"
              title="bbox 200 237 586 301">
            <p class="ocr_par" id="par_1_1" lang="eng"
               title="bbox 200 237 586 301">
               <span class="ocr_line" id="line_1_1"
                     title="bbox 200 237 586 301; baseline 0 -1; x_size 84.166664; x_descenders 21.041666; x_ascenders 21.041666">
                  <span class="ocrx_word" id="word_1_1"
                        title="bbox 200 237 373 300; x_wconf 96">PDF</span>
                  <span class="ocrx_word" id="word_1_2"
                        title="bbox 404 237 586 301; x_wconf 96">Text</span>
               </span>
            </p>
         </div>
         <div class="ocr_carea" id="block_1_2"
              title="bbox 191 387 842 431">
            <p class="ocr_par" id="par_1_2" lang="eng"
               title="bbox 191 387 842 431">
               <span class="ocr_line" id="line_1_2"
                     title="bbox 191 387 842 431; baseline 0 -10; x_size 44; x_descenders 10; x_ascenders 11">
                  <span class="ocrx_word" id="word_1_3"
                        title="bbox 191 387 271 421; x_wconf 96">This</span>
                  <span class="ocrx_word" id="word_1_4"
                        title="bbox 289 389 316 421; x_wconf 96">is</span>
                  <span class="ocrx_word" id="word_1_5"
                        title="bbox 347 398 352 421; x_wconf 96">a</span>
                  <span class="ocrx_word" id="word_1_6"
                        title="bbox 369 387 512 431; x_wconf 96">sample</span>
                  <span class="ocrx_word" id="word_1_7"
                        title="bbox 531 389 613 421; x_wconf 96">PDF</span>
                  <span class="ocrx_word" id="word_1_8"
                        title="bbox 629 387 842 421; x_wconf 95">document.</span>
               </span>
            </p>
         </div>
         <div class="ocr_photo" id="block_1_3"
              title="bbox 206 578 672 1056"/>
         <div class="ocr_carea" id="block_1_4"
              title="bbox 190 1212 495 1257">
            <p class="ocr_par" id="par_1_3" lang="eng"
               title="bbox 190 1212 495 1257">
               <span class="ocr_line" id="line_1_3"
                     title="bbox 190 1212 495 1257; baseline 0 -11; x_size 45; x_descenders 11; x_ascenders 11">
                  <span class="ocrx_word" id="word_1_9"
                        title="bbox 190 1212 283 1246; x_wconf 95">With</span>
                  <span class="ocrx_word"
                        id="word_1_10"
                        title="bbox 301 1223 345 1246; x_wconf 95">an</span>
                  <span class="ocrx_word"
                        id="word_1_11"
                        title="bbox 364 1214 495 1257; x_wconf 95">image.</span>
               </span>
            </p>
         </div>
      </div>
   </body>
</html>

Schematron checks

<s:schema queryBinding="xslt2"
          xmlns:s="http://purl.oclc.org/dsdl/schematron"
          xmlns:t="http://xproc.org/ns/testsuite/3.0">
   <!-- Assertions applied to the pipeline result. The h prefix is bound to
        the XHTML namespace so the XPath tests can address the namespaced
        html, body, div and span elements. -->
   <s:ns uri="http://www.w3.org/1999/xhtml" prefix="h"/>

   <!-- Check 1: the document element must be an XHTML html element. -->
   <s:pattern>
      <s:rule context="/">
         <s:assert test="h:html">Wrong document element</s:assert>
      </s:rule>
   </s:pattern>

   <!-- Check 2: evaluated on the html element.
        First assert: body has a child div whose class is exactly
        'ocr_page'.
        Second assert: a general comparison, so it holds as soon as any
        descendant span with class 'ocrx_word' has the string value 'PDF'. -->
   <s:pattern>
      <s:rule context="/h:html">
         <s:assert test="h:body/h:div[@class='ocr_page']">Unexpected page output</s:assert>
         <s:assert test="h:body//h:span[@class='ocrx_word'] = 'PDF'">Unexpected output</s:assert>
      </s:rule>
   </s:pattern>
</s:schema>

Revision history

12 Jun 2026, Norm Tovey-Walsh
Created test.