<?xml version="1.0" encoding="utf-8"?>
<t:test xmlns:t="http://xproc.org/ns/testsuite/3.0"
        xml:base="/tesseract-002.xml"
        name="tesseract-002.xml"
        expected="pass">
   <t:info>
      <t:title>tesseract-002</t:title>
      <t:revision-history>
         <t:revision>
            <t:date>2026-06-12</t:date>
            <t:author>
               <t:name>Norm Tovey-Walsh</t:name>
            </t:author>
            <t:description xmlns="http://www.w3.org/1999/xhtml">
               <p>Created test.</p>
            </t:description>
         </t:revision>
      </t:revision-history>
   </t:info>
   <t:description xmlns="http://www.w3.org/1999/xhtml">
      <p>Tests that cx:tesseract produces hOCR output when output-format="hocr" is requested.</p>
   </t:description>
   <t:pipeline>
      <p:declare-step version="3.0"
                      name="main"
                      xmlns:p="http://www.w3.org/ns/xproc"
                      xmlns:cx="http://xmlcalabash.com/ns/extensions">
         <!-- Pipeline under test: a PDF is handed to cx:pdf-to-images and
              cx:tesseract follows it, asked for hOCR output. No explicit
              connections are declared between the two steps, so the wiring
              relies on XProc's implicit default-readable-port rules. -->

         <!-- Extension step libraries that declare the cx:* steps used below. -->
         <p:import href="https://xmlcalabash.com/ext/library/pdf-steps.xpl"/>
         <p:import href="https://xmlcalabash.com/ext/library/tesseract.xpl"/>

         <!-- Single output port; the Schematron assertions of this test are
              written against an XHTML (hOCR) document. -->
         <p:output port="result"/>

         <!-- Step 1: the sample PDF is supplied on the source port, with the
              dpi option set to 300. -->
         <cx:pdf-to-images dpi="300">
            <p:with-input port="source">
               <p:document href="../documents/example.pdf"/>
            </p:with-input>
         </cx:pdf-to-images>

         <!-- Step 2: OCR with the "eng" language and hOCR output format.
              NOTE(review): "/dev/null" is a POSIX-only path; confirm this
              test is never expected to run on Windows. -->
         <cx:tesseract output-format="hocr"
                       language="eng"
                       debug-output="/dev/null"/>
      </p:declare-step>
   </t:pipeline>
   <t:schematron>
      <s:schema queryBinding="xslt2" xmlns:s="http://purl.oclc.org/dsdl/schematron">
         <!-- The document being checked is XHTML, so every element test below
              uses the "h" prefix bound here. -->
         <s:ns uri="http://www.w3.org/1999/xhtml" prefix="h"/>

         <!-- Pattern 1: the document element must be an XHTML html element. -->
         <s:pattern>
            <s:rule context="/">
               <s:assert test="exists(h:html)">Wrong document element</s:assert>
            </s:rule>
         </s:pattern>

         <!-- Pattern 2: the body must hold at least one hOCR page div, and at
              least one recognised word must be exactly "PDF". -->
         <s:pattern>
            <s:rule context="/h:html">
               <s:assert test="exists(h:body/h:div[@class = 'ocr_page'])">Unexpected page output</s:assert>
               <s:assert test="'PDF' = h:body//h:span[@class = 'ocrx_word']">Unexpected output</s:assert>
            </s:rule>
         </s:pattern>
      </s:schema>
   </t:schematron>
   <!-- Recorded hOCR (XHTML) document for this test: one ocr_page div holding
        three ocr_carea text blocks ("PDF Text", "This is a sample PDF
        document.", "With an image.") and one empty ocr_photo block.
        NOTE(review): presumably a captured run result; the pass/fail
        assertions of this test live in t:schematron. Confirm how the harness
        uses t:result before editing it, since whitespace and bbox values
        inside it are data. -->
   <t:result name="tesseract-002.xml">
      <html xmlns="http://www.w3.org/1999/xhtml">
         <head>
            <title/>
            <meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
            <meta name="ocr-system" content="tesseract"/>
         </head>
         <body>
            <div class="ocr_page"
                 id="page_1"
                 title="image &#34;unknown&#34;; bbox 0 0 2480 3507; ppageno 0; scan_res 70 70">
               <div class="ocr_carea" id="block_1_1" title="bbox 200 237 586 301">
                  <p class="ocr_par"
                     id="par_1_1"
                     lang="eng"
                     title="bbox 200 237 586 301">
                     <span class="ocr_line"
                           id="line_1_1"
                           title="bbox 200 237 586 301; baseline 0 -1; x_size 84.166664; x_descenders 21.041666; x_ascenders 21.041666">
                        <span class="ocrx_word"
                              id="word_1_1"
                              title="bbox 200 237 373 300; x_wconf 96">PDF</span>
                        <span class="ocrx_word"
                              id="word_1_2"
                              title="bbox 404 237 586 301; x_wconf 96">Text</span>
                     </span>
                  </p>
               </div>
               <div class="ocr_carea" id="block_1_2" title="bbox 191 387 842 431">
                  <p class="ocr_par"
                     id="par_1_2"
                     lang="eng"
                     title="bbox 191 387 842 431">
                     <span class="ocr_line"
                           id="line_1_2"
                           title="bbox 191 387 842 431; baseline 0 -10; x_size 44; x_descenders 10; x_ascenders 11">
                        <span class="ocrx_word"
                              id="word_1_3"
                              title="bbox 191 387 271 421; x_wconf 96">This</span>
                        <span class="ocrx_word"
                              id="word_1_4"
                              title="bbox 289 389 316 421; x_wconf 96">is</span>
                        <span class="ocrx_word"
                              id="word_1_5"
                              title="bbox 347 398 352 421; x_wconf 96">a</span>
                        <span class="ocrx_word"
                              id="word_1_6"
                              title="bbox 369 387 512 431; x_wconf 96">sample</span>
                        <span class="ocrx_word"
                              id="word_1_7"
                              title="bbox 531 389 613 421; x_wconf 96">PDF</span>
                        <span class="ocrx_word"
                              id="word_1_8"
                              title="bbox 629 387 842 421; x_wconf 95">document.</span>
                     </span>
                  </p>
               </div>
               <div class="ocr_photo" id="block_1_3" title="bbox 206 578 672 1056"/>
               <div class="ocr_carea" id="block_1_4" title="bbox 190 1212 495 1257">
                  <p class="ocr_par"
                     id="par_1_3"
                     lang="eng"
                     title="bbox 190 1212 495 1257">
                     <span class="ocr_line"
                           id="line_1_3"
                           title="bbox 190 1212 495 1257; baseline 0 -11; x_size 45; x_descenders 11; x_ascenders 11">
                        <span class="ocrx_word"
                              id="word_1_9"
                              title="bbox 190 1212 283 1246; x_wconf 95">With</span>
                        <span class="ocrx_word"
                              id="word_1_10"
                              title="bbox 301 1223 345 1246; x_wconf 95">an</span>
                        <span class="ocrx_word"
                              id="word_1_11"
                              title="bbox 364 1214 495 1257; x_wconf 95">image.</span>
                     </span>
                  </p>
               </div>
            </div>
         </body>
      </html>
   </t:result>
</t:test>
