about summary refs log tree commit diff
path: root/pkgs/applications/misc
diff options
context:
space:
mode:
author Daniel Fullmer <danielrf12@gmail.com> 2019-09-13 15:53:03 -0400
committer Matthieu Coudron <coudron@iij.ad.jp> 2019-09-14 21:52:06 +0900
commit 740d4c22ecccf94b5b55c449437069a96a005c6c (patch)
tree 7c5d37de364c697768813a316ce85c4f3b78006f /pkgs/applications/misc
parent 9128fe19cc4aa7b3ac285494f15a4060ca2d3f7c (diff)
k2pdfopt: Fix build and clean up
Diffstat (limited to 'pkgs/applications/misc')
-rw-r--r-- pkgs/applications/misc/k2pdfopt/default.nix 62
-rw-r--r-- pkgs/applications/misc/k2pdfopt/leptonica-CVE-2018-3836.patch 95
-rw-r--r-- pkgs/applications/misc/k2pdfopt/leptonica.patch 254
-rw-r--r-- pkgs/applications/misc/k2pdfopt/mupdf.patch 1060
-rw-r--r-- pkgs/applications/misc/k2pdfopt/tesseract.patch 678
5 files changed, 1991 insertions, 158 deletions
diff --git a/pkgs/applications/misc/k2pdfopt/default.nix b/pkgs/applications/misc/k2pdfopt/default.nix
index 9391fe88c5eaa..58bd200e713c3 100644
--- a/pkgs/applications/misc/k2pdfopt/default.nix
+++ b/pkgs/applications/misc/k2pdfopt/default.nix
@@ -36,67 +36,19 @@ stdenv.mkDerivation rec {
 
   buildInputs =
   let
+    #  The patches below were constructed by taking the files from k2pdfopt in
+    #  the {mupdf,leptonica,tesseract}_mod/ directories, replacing the
+    #  corresponding files in the respective source trees, resolving any errors
+    #  with more recent versions of these dependencies, and running diff.
     mupdf_modded = mupdf.overrideAttrs (attrs: {
-      # Excluded the pdf-*.c files, since they mostly just broke the #includes
-      prePatch = ''
-        cp ${src}/mupdf_mod/{font,stext-device,string}.c source/fitz/
-        cp ${src}/mupdf_mod/font-win32.c source/pdf/
-      '';
+      patches = attrs.patches ++ [ ./mupdf.patch ]; # Last verified with mupdf 1.14.0
     });
-
     leptonica_modded = leptonica.overrideAttrs (attrs: {
-      name = "leptonica-1.74.4";
-      # Modified source files apply to this particular version of leptonica
-      version = "1.74.4";
-
-      src = fetchurl {
-        url = "http://www.leptonica.org/source/leptonica-1.74.4.tar.gz";
-        sha256 = "0fw39amgyv8v6nc7x8a4c7i37dm04i6c5zn62d24bgqnlhk59hr9";
-      };
-
-      prePatch = ''
-        cp ${src}/leptonica_mod/{allheaders.h,dewarp2.c,leptwin.c} src/
-      '';
-      patches = [
-        # stripped down copy of upstream commit b88c821f8d347bce0aea86d606c710303919f3d2
-        ./leptonica-CVE-2018-3836.patch
-        (fetchpatch {
-          # CVE-2018-7186
-          url = "https://github.com/DanBloomberg/leptonica/commit/"
-              + "ee301cb2029db8a6289c5295daa42bba7715e99a.patch";
-          sha256 = "0cgb7mvz2px1rg5i80wk1wxxjvzjga617d8q6j7qygkp7jm6495d";
-        })
-        (fetchpatch {
-          # CVE-2018-7247
-          url = "https://github.com/DanBloomberg/leptonica/commit/"
-              + "c1079bb8e77cdd426759e466729917ca37a3ed9f.patch";
-          sha256 = "1z4iac5gwqggh7aa8cvyp6nl9fwd1v7wif26caxc9y5qr3jj34qf";
-        })
-        (fetchpatch {
-          # CVE-2018-7440
-          url = "https://github.com/DanBloomberg/leptonica/commit/"
-              + "49ecb6c2dfd6ed5078c62f4a8eeff03e3beced3b.patch";
-          sha256 = "1hjmva98iaw9xj7prg7aimykyayikcwnk4hk0380007hqb35lqmy";
-        })
-      ];
+      patches = [ ./leptonica.patch ]; # Last verified with leptonica 1.78.0
     });
     tesseract_modded = tesseract4.override {
       tesseractBase = tesseract4.tesseractBase.overrideAttrs (_: {
-        prePatch = ''
-          cp ${src}/tesseract_mod/baseapi.{h,cpp} src/api/
-          cp ${src}/tesseract_mod/ccutil.{h,cpp} src/ccutil/
-          cp ${src}/tesseract_mod/genericvector.h src/ccutil/
-          cp ${src}/tesseract_mod/input.cpp src/lstm/
-          cp ${src}/tesseract_mod/lstmrecognizer.cpp src/lstm/
-          cp ${src}/tesseract_mod/mainblk.cpp src/ccutil/
-          cp ${src}/tesseract_mod/params.cpp src/ccutil/
-          cp ${src}/tesseract_mod/serialis.{h,cpp} src/ccutil/
-          cp ${src}/tesseract_mod/tesscapi.cpp src/api/
-          cp ${src}/tesseract_mod/tessdatamanager.cpp src/ccstruct/
-          cp ${src}/tesseract_mod/tessedit.cpp src/ccmain/
-          cp ${src}/include_mod/{tesseract.h,leptonica.h} src/api/
-        '';
-        patches = [ ./tesseract.patch ];
+        patches = [ ./tesseract.patch ]; # Last verified with tesseract 4.1
       });
     };
   in
diff --git a/pkgs/applications/misc/k2pdfopt/leptonica-CVE-2018-3836.patch b/pkgs/applications/misc/k2pdfopt/leptonica-CVE-2018-3836.patch
deleted file mode 100644
index f1b4170fbaae9..0000000000000
--- a/pkgs/applications/misc/k2pdfopt/leptonica-CVE-2018-3836.patch
+++ /dev/null
@@ -1,95 +0,0 @@
---- a/src/allheaders.h
-+++ b/src/allheaders.h
-@@ -2600,6 +2600,7 @@
- LEPT_DLL extern char * stringReverse ( const char *src );
- LEPT_DLL extern char * strtokSafe ( char *cstr, const char *seps, char **psaveptr );
- LEPT_DLL extern l_int32 stringSplitOnToken ( char *cstr, const char *seps, char **phead, char **ptail );
-+LEPT_DLL extern l_int32 stringCheckForChars ( const char *src, const char *chars, l_int32 *pfound );
- LEPT_DLL extern char * stringRemoveChars ( const char *src, const char *remchars );
- LEPT_DLL extern l_int32 stringFindSubstr ( const char *src, const char *sub, l_int32 *ploc );
- LEPT_DLL extern char * stringReplaceSubstr ( const char *src, const char *sub1, const char *sub2, l_int32 *pfound, l_int32 *ploc );
---- a/src/gplot.c
-+++ b/src/gplot.c
-@@ -141,9 +141,10 @@
-             const char  *xlabel,
-             const char  *ylabel)
- {
--char   *newroot;
--char    buf[L_BUF_SIZE];
--GPLOT  *gplot;
-+char    *newroot;
-+char     buf[L_BUF_SIZE];
-+l_int32  badchar;
-+GPLOT   *gplot;
- 
-     PROCNAME("gplotCreate");
- 
-@@ -152,6 +153,9 @@
-     if (outformat != GPLOT_PNG && outformat != GPLOT_PS &&
-         outformat != GPLOT_EPS && outformat != GPLOT_LATEX)
-         return (GPLOT *)ERROR_PTR("outformat invalid", procName, NULL);
-+    stringCheckForChars(rootname, "`;&|><\"?*", &badchar);
-+    if (badchar)  /* danger of command injection */
-+        return (GPLOT *)ERROR_PTR("invalid rootname", procName, NULL);
- 
-     if ((gplot = (GPLOT *)LEPT_CALLOC(1, sizeof(GPLOT))) == NULL)
-         return (GPLOT *)ERROR_PTR("gplot not made", procName, NULL);
---- a/src/utils2.c
-+++ b/src/utils2.c
-@@ -42,6 +42,7 @@
-  *           l_int32    stringSplitOnToken()
-  *
-  *       Find and replace string and array procs
-+ *           l_int32    stringCheckForChars()
-  *           char      *stringRemoveChars()
-  *           l_int32    stringFindSubstr()
-  *           char      *stringReplaceSubstr()
-@@ -701,6 +702,48 @@
- /*--------------------------------------------------------------------*
-  *                       Find and replace procs                       *
-  *--------------------------------------------------------------------*/
-+/*!
-+ * \brief   stringCheckForChars()
-+ *
-+ * \param[in]    src      input string; can be of zero length
-+ * \param[in]    chars    string of chars to be searched for in %src
-+ * \param[out]   pfound   1 if any characters are found; 0 otherwise
-+ * \return  0 if OK, 1 on error
-+ *
-+ * <pre>
-+ * Notes:
-+ *      (1) This can be used to sanitize an operation by checking for
-+ *          special characters that don't belong in a string.
-+ * </pre>
-+ */
-+l_int32
-+stringCheckForChars(const char  *src,
-+                    const char  *chars,
-+                    l_int32     *pfound)
-+{
-+char     ch;
-+l_int32  i, n;
-+
-+    PROCNAME("stringCheckForChars");
-+
-+    if (!pfound)
-+        return ERROR_INT("&found not defined", procName, 1);
-+    *pfound = FALSE;
-+    if (!src || !chars)
-+        return ERROR_INT("src and chars not both defined", procName, 1);
-+
-+    n = strlen(src);
-+    for (i = 0; i < n; i++) {
-+        ch = src[i];
-+        if (strchr(chars, ch)) {
-+            *pfound = TRUE;
-+            break;
-+        }
-+    }
-+    return 0;
-+}
-+
-+
- /*!
-  * \brief   stringRemoveChars()
-  *
diff --git a/pkgs/applications/misc/k2pdfopt/leptonica.patch b/pkgs/applications/misc/k2pdfopt/leptonica.patch
new file mode 100644
index 0000000000000..dfab99fd0130d
--- /dev/null
+++ b/pkgs/applications/misc/k2pdfopt/leptonica.patch
@@ -0,0 +1,254 @@
+From 8c11a20925686855023df90ed477957c7d7fe91e Mon Sep 17 00:00:00 2001
+From: Daniel Fullmer <danielrf12@gmail.com>
+Date: Fri, 13 Sep 2019 15:54:21 -0400
+Subject: [PATCH] Willus mod for k2pdfopt
+
+---
+ src/allheaders.h |   4 ++
+ src/dewarp2.c    | 106 ++++++++++++++++++++++++++++++++++++++++++-----
+ src/leptwin.c    |   6 ++-
+ 3 files changed, 104 insertions(+), 12 deletions(-)
+
+diff --git a/src/allheaders.h b/src/allheaders.h
+index e68eff1..b3cc729 100644
+--- a/src/allheaders.h
++++ b/src/allheaders.h
+@@ -669,6 +669,10 @@ LEPT_DLL extern L_DEWARPA * dewarpaReadMem ( const l_uint8 *data, size_t size );
+ LEPT_DLL extern l_ok dewarpaWrite ( const char *filename, L_DEWARPA *dewa );
+ LEPT_DLL extern l_ok dewarpaWriteStream ( FILE *fp, L_DEWARPA *dewa );
+ LEPT_DLL extern l_ok dewarpaWriteMem ( l_uint8 **pdata, size_t *psize, L_DEWARPA *dewa );
++/* WILLUS MOD */
++    LEPT_DLL extern l_int32 dewarpBuildPageModel_ex ( L_DEWARP *dew, const char *debugfile,l_int32 fit_order );
++    LEPT_DLL extern l_int32 dewarpFindVertDisparity_ex ( L_DEWARP *dew, PTAA *ptaa, l_int32 rotflag,l_int32 fit_order );
++    LEPT_DLL extern l_int32 dewarpBuildLineModel_ex ( L_DEWARP *dew, l_int32 opensize, const char *debugfile,l_int32 fit_order );
+ LEPT_DLL extern l_ok dewarpBuildPageModel ( L_DEWARP *dew, const char *debugfile );
+ LEPT_DLL extern l_ok dewarpFindVertDisparity ( L_DEWARP *dew, PTAA *ptaa, l_int32 rotflag );
+ LEPT_DLL extern l_ok dewarpFindHorizDisparity ( L_DEWARP *dew, PTAA *ptaa );
+diff --git a/src/dewarp2.c b/src/dewarp2.c
+index 220eec1..2e29500 100644
+--- a/src/dewarp2.c
++++ b/src/dewarp2.c
+@@ -144,9 +144,17 @@ static const l_float32   L_ALLOWED_W_FRACT = 0.05;  /* no bigger */
+  *          longest textlines.
+  * </pre>
+  */
++/* WILLUS MOD */
+ l_ok
+-dewarpBuildPageModel(L_DEWARP    *dew,
+-                     const char  *debugfile)
++dewarpBuildPageModel(L_DEWARP *dew,const char *debugfile)
++{
++return(dewarpBuildPageModel_ex(dew,debugfile,2));
++}
++
++l_ok
++dewarpBuildPageModel_ex(L_DEWARP    *dew,
++                     const char  *debugfile,
++                     l_int32 fit_order)
+ {
+ l_int32  linecount, topline, botline, ret;
+ PIX     *pixs, *pix1, *pix2, *pix3;
+@@ -225,7 +233,7 @@ PTAA    *ptaa1, *ptaa2;
+         /* Get the sampled vertical disparity from the textline centers.
+          * The disparity array will push pixels vertically so that each
+          * textline is flat and centered at the y-position of the mid-point. */
+-    if (dewarpFindVertDisparity(dew, ptaa2, 0) != 0) {
++    if (dewarpFindVertDisparity_ex(dew, ptaa2, 0, fit_order) != 0) {
+         L_WARNING("vertical disparity not built\n", procName);
+         ptaaDestroy(&ptaa2);
+         return 1;
+@@ -290,13 +298,24 @@ PTAA    *ptaa1, *ptaa2;
+  *          a pdf.  Non-pix debug output goes to /tmp.
+  * </pre>
+  */
++/* WILLUS MOD */
+ l_ok
+ dewarpFindVertDisparity(L_DEWARP  *dew,
+                         PTAA      *ptaa,
+                         l_int32    rotflag)
+ {
++return(dewarpFindVertDisparity_ex(dew,ptaa,rotflag,2));
++}
++/* WILLUS MOD -- add cubic and quartic fits and ..._ex functions */
++l_int32
++dewarpFindVertDisparity_ex(L_DEWARP  *dew,
++                        PTAA      *ptaa,
++                        l_int32    rotflag,
++                        l_int32    fit_order)
++{
+ l_int32     i, j, nlines, npts, nx, ny, sampling;
+-l_float32   c0, c1, c2, x, y, midy, val, medval, meddev, minval, maxval;
++/* WILLUS MOD */
++l_float32   c0, c1, c2, c3, c4, x, y, midy, val, medval, meddev, minval, maxval;
+ l_float32  *famidys;
+ NUMA       *nax, *nafit, *nacurve0, *nacurve1, *nacurves;
+ NUMA       *namidy, *namidys, *namidysi;
+@@ -304,11 +323,22 @@ PIX        *pix1, *pix2, *pixcirc, *pixdb;
+ PTA        *pta, *ptad, *ptacirc;
+ PTAA       *ptaa0, *ptaa1, *ptaa2, *ptaa3, *ptaa4, *ptaa5, *ptaat;
+ FPIX       *fpix;
++/* WILLUS MOD */
++l_int32 fit_order1,fit_order2;
+ 
+     PROCNAME("dewarpFindVertDisparity");
+ 
+     if (!dew)
+         return ERROR_INT("dew not defined", procName, 1);
++/* WILLUS MOD */
++    if (fit_order < 10)
++        fit_order1 = fit_order2 = fit_order;
++    else
++        {
++        fit_order1=fit_order % 10;
++        fit_order2=fit_order / 10;
++        fit_order2=fit_order2 % 10;
++        }
+     dew->vsuccess = 0;
+     if (!ptaa)
+         return ERROR_INT("ptaa not defined", procName, 1);
+@@ -331,12 +361,32 @@ FPIX       *fpix;
+     pixdb = (rotflag) ? pixRotateOrth(dew->pixs, 1) : pixClone(dew->pixs);
+     for (i = 0; i < nlines; i++) {  /* for each line */
+         pta = ptaaGetPta(ptaa, i, L_CLONE);
+-        ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL);
+-        numaAddNumber(nacurve0, c2);
++/* WILLUS MOD */
++if (fit_order1>3)
++    {
++    ptaGetQuarticLSF(pta, &c4, &c3, &c2, &c1, &c0, NULL);
++    numaAddNumber(nacurve0, c4);
++    }
++else if (fit_order1==3)
++    {
++    ptaGetCubicLSF(pta, &c3, &c2, &c1, &c0, NULL);
++    numaAddNumber(nacurve0, c3);
++    }
++else
++    {
++    ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL);
++    numaAddNumber(nacurve0, c2);
++    }
+         ptad = ptaCreate(nx);
+         for (j = 0; j < nx; j++) {  /* uniformly sampled in x */
+              x = j * sampling;
+-             applyQuadraticFit(c2, c1, c0, x, &y);
++/* WILLUS MOD */
++if (fit_order1>3)
++    applyQuarticFit(c4, c3, c2, c1, c0, x, &y);
++else if (fit_order1==3)
++    applyCubicFit(c3, c2, c1, c0, x, &y);
++else
++    applyQuadraticFit(c2, c1, c0, x, &y);
+              ptaAddPt(ptad, x, y);
+         }
+         ptaaAddPta(ptaa0, ptad, L_INSERT);
+@@ -350,7 +400,13 @@ FPIX       *fpix;
+         for (i = 0; i < nlines; i++) {
+             pta = ptaaGetPta(ptaa, i, L_CLONE);
+             ptaGetArrays(pta, &nax, NULL);
+-            ptaGetQuadraticLSF(pta, NULL, NULL, NULL, &nafit);
++/* WILLUS MOD */
++if (fit_order1>3)
++ptaGetQuarticLSF(pta, NULL, NULL, NULL, NULL, NULL, &nafit);
++else if (fit_order1==3)
++ptaGetCubicLSF(pta, NULL, NULL, NULL, NULL, &nafit);
++else
++ptaGetQuadraticLSF(pta, NULL, NULL, NULL, &nafit);
+             ptad = ptaCreateFromNuma(nax, nafit);
+             ptaaAddPta(ptaat, ptad, L_INSERT);
+             ptaDestroy(&pta);
+@@ -494,11 +550,24 @@ FPIX       *fpix;
+     ptaa5 = ptaaCreate(nx);  /* uniformly sampled across full height of image */
+     for (j = 0; j < nx; j++) {  /* for each column */
+         pta = ptaaGetPta(ptaa4, j, L_CLONE);
+-        ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL);
++/* WILLUS MOD */
++/* Order higher than 2 can cause a little craziness here. */
++if (fit_order2>3)
++    ptaGetQuarticLSF(pta, &c4, &c3, &c2, &c1, &c0, NULL);
++else if (fit_order2==3)
++    ptaGetCubicLSF(pta, &c3, &c2, &c1, &c0, NULL);
++else
++    ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL);
+         ptad = ptaCreate(ny);
+         for (i = 0; i < ny; i++) {  /* uniformly sampled in y */
+              y = i * sampling;
+-             applyQuadraticFit(c2, c1, c0, y, &val);
++/* WILLUS MOD */
++if (fit_order2>3)
++    applyQuarticFit(c4, c3, c2, c1, c0, y, &val);
++else if (fit_order2==3)
++    applyCubicFit(c3, c2, c1, c0, y, &val);
++else
++    applyQuadraticFit(c2, c1, c0, y, &val);
+              ptaAddPt(ptad, y, val);
+         }
+         ptaaAddPta(ptaa5, ptad, L_INSERT);
+@@ -1602,11 +1671,21 @@ FPIX      *fpix;
+  *          See notes there.
+  * </pre>
+  */
++/* WILLUS MOD */
+ l_ok
+ dewarpBuildLineModel(L_DEWARP    *dew,
+                      l_int32      opensize,
+                      const char  *debugfile)
+ {
++return(dewarpBuildLineModel_ex(dew,opensize,debugfile,2));
++}
++
++l_int32
++dewarpBuildLineModel_ex(L_DEWARP    *dew,
++                     l_int32      opensize,
++                     const char  *debugfile,
++                     l_int32 fit_order)
++{
+ char     buf[64];
+ l_int32  i, j, bx, by, ret, nlines;
+ BOXA    *boxa;
+@@ -1695,6 +1774,8 @@ PTAA    *ptaa1, *ptaa2;
+ 
+             /* Remove all lines that are not at least 0.75 times the length
+              * of the longest line. */
++/* WILLUS MOD */
++/*
+         ptaa2 = dewarpRemoveShortLines(pix, ptaa1, 0.75, DEBUG_SHORT_LINES);
+         if (debugfile) {
+             pix1 = pixConvertTo32(pix);
+@@ -1704,6 +1785,8 @@ PTAA    *ptaa1, *ptaa2;
+             pixDestroy(&pix1);
+             pixDestroy(&pix2);
+         }
++*/
++ptaa2=ptaa1;
+         ptaaDestroy(&ptaa1);
+         nlines = ptaaGetCount(ptaa2);
+         if (nlines < dew->minlines) {
+@@ -1717,7 +1800,8 @@ PTAA    *ptaa1, *ptaa2;
+              * centers.  The disparity array will push pixels vertically
+              * so that each line is flat and centered at the y-position
+              * of the mid-point. */
+-        ret = dewarpFindVertDisparity(dew, ptaa2, 1 - i);
++/* WILLUS MOD */
++        ret = dewarpFindVertDisparity_ex(dew, ptaa2, 1 - i, fit_order);
+ 
+             /* If i == 0, move the result to the horizontal disparity,
+              * rotating it back by -90 degrees. */
+diff --git a/src/leptwin.c b/src/leptwin.c
+index 72643a0..573d33e 100644
+--- a/src/leptwin.c
++++ b/src/leptwin.c
+@@ -364,5 +364,9 @@ PIXCMAP   *cmap;
+ 
+     return hBitmap;
+ }
+-
++#else
++/* willus mod: Avoid weird issue with OS/X library archiver when there are no symbols */
++int leptwin_my_empty_func(void);
++int leptwin_my_empty_func(void)
++{return(0);}
+ #endif   /* _WIN32 */
+-- 
+2.22.0
+
diff --git a/pkgs/applications/misc/k2pdfopt/mupdf.patch b/pkgs/applications/misc/k2pdfopt/mupdf.patch
new file mode 100644
index 0000000000000..f7c04d42a71d5
--- /dev/null
+++ b/pkgs/applications/misc/k2pdfopt/mupdf.patch
@@ -0,0 +1,1060 @@
+From 3d763f84872351c250ffea26150e73b02b8f4c6f Mon Sep 17 00:00:00 2001
+From: Daniel Fullmer <danielrf12@gmail.com>
+Date: Fri, 13 Sep 2019 15:11:45 -0400
+Subject: [PATCH] Willus mod for k2pdfopt
+
+---
+ source/fitz/filter-basic.c |   3 +
+ source/fitz/font-win32.c   | 866 +++++++++++++++++++++++++++++++++++++
+ source/fitz/font.c         |   3 +
+ source/fitz/stext-device.c |   5 +
+ source/fitz/string.c       |   5 +
+ source/pdf/pdf-annot.c     |  14 +-
+ source/pdf/pdf-link.c      |   3 +
+ source/pdf/pdf-parse.c     |   5 +
+ source/pdf/pdf-xref.c      |   9 +
+ 9 files changed, 912 insertions(+), 1 deletion(-)
+ create mode 100644 source/fitz/font-win32.c
+
+diff --git a/source/fitz/filter-basic.c b/source/fitz/filter-basic.c
+index 0713a62e7..b8ef4d292 100644
+--- a/source/fitz/filter-basic.c
++++ b/source/fitz/filter-basic.c
+@@ -259,7 +259,10 @@ look_for_endstream:
+ 	if (!state->warned)
+ 	{
+ 		state->warned = 1;
++/* willus mod -- no warning */
++/*
+ 		fz_warn(ctx, "PDF stream Length incorrect");
++*/
+ 	}
+ 	return *stm->rp++;
+ }
+diff --git a/source/fitz/font-win32.c b/source/fitz/font-win32.c
+new file mode 100644
+index 000000000..45de8cfd3
+--- /dev/null
++++ b/source/fitz/font-win32.c
+@@ -0,0 +1,866 @@
++/*
++** Routines to access MS Windows system fonts.
++** From sumatra PDF distro.
++** Modified for MuPDF v1.9a by willus.com
++*/
++#include "mupdf/pdf.h"
++
++/*
++	Which fonts are embedded is based on a few preprocessor definitions.
++
++	The base 14 fonts are always embedded.
++	For CJK font substitution we embed DroidSansFallback.
++
++	Set NOCJK to skip all CJK support (this also omits embedding the CJK CMaps)
++	Set NOCJKFONT to skip the embedded CJK font.
++	Set NOCJKFULL to embed a smaller CJK font without CJK Extension A support.
++*/
++
++#ifdef NOCJK
++#define NOCJKFONT
++#endif
++
++/* SumatraPDF: also load fonts included with Windows */
++#ifdef _WIN32
++
++#ifndef UNICODE
++#define UNICODE
++#endif
++#ifndef _UNICODE
++#define _UNICODE
++#endif
++
++#include <windows.h>
++
++// TODO: Use more of FreeType for TTF parsing (for performance reasons,
++//       the fonts can't be parsed completely, though)
++#include <ft2build.h>
++#include FT_TRUETYPE_IDS_H
++#include FT_TRUETYPE_TAGS_H
++
++#define TTC_VERSION1	0x00010000
++#define TTC_VERSION2	0x00020000
++
++#define MAX_FACENAME	128
++
++// Note: the font face must be the first field so that the structure
++//       can be treated like a simple string for searching
++typedef struct pdf_fontmapMS_s
++{
++	char fontface[MAX_FACENAME];
++	char fontpath[MAX_PATH];
++	int index;
++} pdf_fontmapMS;
++
++typedef struct pdf_fontlistMS_s
++{
++	pdf_fontmapMS *fontmap;
++	int len;
++	int cap;
++} pdf_fontlistMS;
++
++typedef struct _tagTT_OFFSET_TABLE
++{
++	ULONG	uVersion;
++	USHORT	uNumOfTables;
++	USHORT	uSearchRange;
++	USHORT	uEntrySelector;
++	USHORT	uRangeShift;
++} TT_OFFSET_TABLE;
++
++typedef struct _tagTT_TABLE_DIRECTORY
++{
++	ULONG	uTag;				//table name
++	ULONG	uCheckSum;			//Check sum
++	ULONG	uOffset;			//Offset from beginning of file
++	ULONG	uLength;			//length of the table in bytes
++} TT_TABLE_DIRECTORY;
++
++typedef struct _tagTT_NAME_TABLE_HEADER
++{
++	USHORT	uFSelector;			//format selector. Always 0
++	USHORT	uNRCount;			//Name Records count
++	USHORT	uStorageOffset;		//Offset for strings storage, from start of the table
++} TT_NAME_TABLE_HEADER;
++
++typedef struct _tagTT_NAME_RECORD
++{
++	USHORT	uPlatformID;
++	USHORT	uEncodingID;
++	USHORT	uLanguageID;
++	USHORT	uNameID;
++	USHORT	uStringLength;
++	USHORT	uStringOffset;	//from start of storage area
++} TT_NAME_RECORD;
++
++typedef struct _tagFONT_COLLECTION
++{
++	ULONG	Tag;
++	ULONG	Version;
++	ULONG	NumFonts;
++} FONT_COLLECTION;
++
++static struct {
++	char *name;
++	char *pattern;
++} baseSubstitutes[] = {
++	{ "Courier", "CourierNewPSMT" },
++	{ "Courier-Bold", "CourierNewPS-BoldMT" },
++	{ "Courier-Oblique", "CourierNewPS-ItalicMT" },
++	{ "Courier-BoldOblique", "CourierNewPS-BoldItalicMT" },
++	{ "Helvetica", "ArialMT" },
++	{ "Helvetica-Bold", "Arial-BoldMT" },
++	{ "Helvetica-Oblique", "Arial-ItalicMT" },
++	{ "Helvetica-BoldOblique", "Arial-BoldItalicMT" },
++	{ "Times-Roman", "TimesNewRomanPSMT" },
++	{ "Times-Bold", "TimesNewRomanPS-BoldMT" },
++	{ "Times-Italic", "TimesNewRomanPS-ItalicMT" },
++	{ "Times-BoldItalic", "TimesNewRomanPS-BoldItalicMT" },
++	{ "Symbol", "SymbolMT" },
++};
++static const char *base_font_names[][10] =
++{
++	{ "Courier", "CourierNew", "CourierNewPSMT", NULL },
++	{ "Courier-Bold", "CourierNew,Bold", "Courier,Bold",
++		"CourierNewPS-BoldMT", "CourierNew-Bold", NULL },
++	{ "Courier-Oblique", "CourierNew,Italic", "Courier,Italic",
++		"CourierNewPS-ItalicMT", "CourierNew-Italic", NULL },
++	{ "Courier-BoldOblique", "CourierNew,BoldItalic", "Courier,BoldItalic",
++		"CourierNewPS-BoldItalicMT", "CourierNew-BoldItalic", NULL },
++	{ "Helvetica", "ArialMT", "Arial", NULL },
++	{ "Helvetica-Bold", "Arial-BoldMT", "Arial,Bold", "Arial-Bold",
++		"Helvetica,Bold", NULL },
++	{ "Helvetica-Oblique", "Arial-ItalicMT", "Arial,Italic", "Arial-Italic",
++		"Helvetica,Italic", "Helvetica-Italic", NULL },
++	{ "Helvetica-BoldOblique", "Arial-BoldItalicMT",
++		"Arial,BoldItalic", "Arial-BoldItalic",
++		"Helvetica,BoldItalic", "Helvetica-BoldItalic", NULL },
++	{ "Times-Roman", "TimesNewRomanPSMT", "TimesNewRoman",
++		"TimesNewRomanPS", NULL },
++	{ "Times-Bold", "TimesNewRomanPS-BoldMT", "TimesNewRoman,Bold",
++		"TimesNewRomanPS-Bold", "TimesNewRoman-Bold", NULL },
++	{ "Times-Italic", "TimesNewRomanPS-ItalicMT", "TimesNewRoman,Italic",
++		"TimesNewRomanPS-Italic", "TimesNewRoman-Italic", NULL },
++	{ "Times-BoldItalic", "TimesNewRomanPS-BoldItalicMT",
++		"TimesNewRoman,BoldItalic", "TimesNewRomanPS-BoldItalic",
++		"TimesNewRoman-BoldItalic", NULL },
++	{ "Symbol", "Symbol,Italic", "Symbol,Bold", "Symbol,BoldItalic",
++		"SymbolMT", "SymbolMT,Italic", "SymbolMT,Bold", "SymbolMT,BoldItalic", NULL },
++	{ "ZapfDingbats", NULL }
++};
++
++static pdf_fontlistMS fontlistMS =
++{
++	NULL,
++	0,
++	0,
++};
++static int strcmp_ignore_space(const char *a, const char *b);
++static const char *clean_font_name(const char *fontname);
++static const char *pdf_clean_base14_name(const char *fontname);
++
++static inline USHORT BEtoHs(USHORT x)
++{
++	BYTE *data = (BYTE *)&x;
++	return (data[0] << 8) | data[1];
++}
++
++static inline ULONG BEtoHl(ULONG x)
++{
++	BYTE *data = (BYTE *)&x;
++	return (data[0] << 24) | (data[1] << 16) | (data[2] << 8) | data[3];
++}
++
++static int strcmp_ignore_space(const char *a, const char *b)
++{
++	while (1)
++	{
++		while (*a == ' ')
++			a++;
++		while (*b == ' ')
++			b++;
++		if (*a != *b)
++			return 1;
++		if (*a == 0)
++			return *a != *b;
++		if (*b == 0)
++			return *a != *b;
++		a++;
++		b++;
++	}
++}
++
++/* A little bit more sophisticated name matching so that e.g. "EurostileExtended"
++   matches "EurostileExtended-Roman" or "Tahoma-Bold,Bold" matches "Tahoma-Bold" */
++static int
++lookup_compare(const void *elem1, const void *elem2)
++{
++	const char *val1 = elem1;
++	const char *val2 = elem2;
++	int len1 = strlen(val1);
++	int len2 = strlen(val2);
++
++	if (len1 != len2)
++	{
++		const char *rest = len1 > len2 ? val1 + len2 : val2 + len1;
++		if (',' == *rest || !_stricmp(rest, "-roman"))
++			return _strnicmp(val1, val2, fz_mini(len1, len2));
++	}
++
++	return _stricmp(val1, val2);
++}
++
++static void
++remove_spaces(char *srcDest)
++{
++	char *dest;
++
++	for (dest = srcDest; *srcDest; srcDest++)
++		if (*srcDest != ' ')
++			*dest++ = *srcDest;
++	*dest = '\0';
++}
++
++static int
++str_ends_with(const char *str, const char *end)
++{
++	size_t len1 = strlen(str);
++	size_t len2 = strlen(end);
++
++	return len1 >= len2 && !strcmp(str + len1 - len2, end);
++}
++
++static pdf_fontmapMS *
++pdf_find_windows_font_path(const char *fontname)
++{
++	return bsearch(fontname, fontlistMS.fontmap, fontlistMS.len, sizeof(pdf_fontmapMS), lookup_compare);
++}
++
++/* source and dest can be same */
++static void
++decode_unicode_BE(fz_context *ctx, char *source, int sourcelen, char *dest, int destlen)
++{
++	WCHAR *tmp;
++	int converted, i;
++
++	if (sourcelen % 2 != 0)
++		fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : invalid unicode string");
++
++	tmp = fz_malloc_array(ctx, sourcelen / 2 + 1, sizeof(WCHAR));
++	for (i = 0; i < sourcelen / 2; i++)
++		tmp[i] = BEtoHs(((WCHAR *)source)[i]);
++	tmp[sourcelen / 2] = '\0';
++
++	converted = WideCharToMultiByte(CP_UTF8, 0, tmp, -1, dest, destlen, NULL, NULL);
++	fz_free(ctx, tmp);
++	if (!converted)
++		fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : invalid unicode string");
++}
++
++static void
++decode_platform_string(fz_context *ctx, int platform, int enctype, char *source, int sourcelen, char *dest, int destlen)
++{
++	switch (platform)
++	{
++	case TT_PLATFORM_APPLE_UNICODE:
++		switch (enctype)
++		{
++		case TT_APPLE_ID_DEFAULT:
++		case TT_APPLE_ID_UNICODE_2_0:
++			decode_unicode_BE(ctx, source, sourcelen, dest, destlen);
++			return;
++		}
++		fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : unsupported encoding (%d/%d)", platform, enctype);
++	case TT_PLATFORM_MACINTOSH:
++		switch (enctype)
++		{
++		case TT_MAC_ID_ROMAN:
++			if (sourcelen + 1 > destlen)
++				fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : overlong fontname: %s", source);
++			// TODO: Convert to UTF-8 from what encoding?
++			memcpy(dest, source, sourcelen);
++			dest[sourcelen] = 0;
++			return;
++		}
++		fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : unsupported encoding (%d/%d)", platform, enctype);
++	case TT_PLATFORM_MICROSOFT:
++		switch (enctype)
++		{
++		case TT_MS_ID_SYMBOL_CS:
++		case TT_MS_ID_UNICODE_CS:
++		case TT_MS_ID_UCS_4:
++			decode_unicode_BE(ctx, source, sourcelen, dest, destlen);
++			return;
++		}
++		fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : unsupported encoding (%d/%d)", platform, enctype);
++	default:
++		fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : unsupported encoding (%d/%d)", platform, enctype);
++	}
++}
++
++static void
++grow_system_font_list(fz_context *ctx, pdf_fontlistMS *fl)
++{
++	int newcap;
++	pdf_fontmapMS *newitems;
++
++	if (fl->cap == 0)
++		newcap = 1024;
++	else
++		newcap = fl->cap * 2;
++
++	// use realloc/free for the fontmap, since the list can
++	// remain in memory even with all fz_contexts destroyed
++	newitems = realloc(fl->fontmap, newcap * sizeof(pdf_fontmapMS));
++	if (!newitems)
++		fz_throw(ctx, FZ_ERROR_GENERIC, "OOM in grow_system_font_list");
++	memset(newitems + fl->cap, 0, sizeof(pdf_fontmapMS) * (newcap - fl->cap));
++
++	fl->fontmap = newitems;
++	fl->cap = newcap;
++}
++
++static void
++append_mapping(fz_context *ctx, pdf_fontlistMS *fl, const char *facename, const char *path, int index)
++{
++	if (fl->len == fl->cap)
++		grow_system_font_list(ctx, fl);
++
++	if (fl->len >= fl->cap)
++		fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : fontlist overflow");
++
++	fz_strlcpy(fl->fontmap[fl->len].fontface, facename, sizeof(fl->fontmap[0].fontface));
++	fz_strlcpy(fl->fontmap[fl->len].fontpath, path, sizeof(fl->fontmap[0].fontpath));
++	fl->fontmap[fl->len].index = index;
++
++	++fl->len;
++}
++
++static void
++safe_read(fz_context *ctx, fz_stream *file, int offset, char *buf, int size)
++{
++	int n;
++	fz_seek(ctx, file, offset, 0);
++	n = fz_read(ctx, file, (unsigned char *)buf, size);
++	if (n != size)
++		fz_throw(ctx, FZ_ERROR_GENERIC, "safe_read: read %d, expected %d", n, size);
++}
++
++static void
++read_ttf_string(fz_context *ctx, fz_stream *file, int offset, TT_NAME_RECORD *ttRecordBE, char *buf, int size)
++{
++	char szTemp[MAX_FACENAME * 2];
++	// ignore empty and overlong strings
++	int stringLength = BEtoHs(ttRecordBE->uStringLength);
++	if (stringLength == 0 || stringLength >= sizeof(szTemp))
++		return;
++
++	safe_read(ctx, file, offset + BEtoHs(ttRecordBE->uStringOffset), szTemp, stringLength);
++	decode_platform_string(ctx, BEtoHs(ttRecordBE->uPlatformID),
++		BEtoHs(ttRecordBE->uEncodingID), szTemp, stringLength, buf, size);
++}
++
++static void
++makeFakePSName(char szName[MAX_FACENAME], const char *szStyle)
++{
++	// append the font's subfamily, unless it's a Regular font
++	if (*szStyle && _stricmp(szStyle, "Regular") != 0)
++	{
++		fz_strlcat(szName, "-", MAX_FACENAME);
++		fz_strlcat(szName, szStyle, MAX_FACENAME);
++	}
++	remove_spaces(szName);
++}
++
++static void
++parseTTF(fz_context *ctx, fz_stream *file, int offset, int index, const char *path)
++{
++	TT_OFFSET_TABLE ttOffsetTableBE;
++	TT_TABLE_DIRECTORY tblDirBE;
++	TT_NAME_TABLE_HEADER ttNTHeaderBE;
++	TT_NAME_RECORD ttRecordBE;
++
++	char szPSName[MAX_FACENAME] = { 0 };
++	char szTTName[MAX_FACENAME] = { 0 };
++	char szStyle[MAX_FACENAME] = { 0 };
++	char szCJKName[MAX_FACENAME] = { 0 };
++	int i, count, tblOffset;
++
++	safe_read(ctx, file, offset, (char *)&ttOffsetTableBE, sizeof(TT_OFFSET_TABLE));
++
++	// check if this is a TrueType font of version 1.0 or an OpenType font
++	if (BEtoHl(ttOffsetTableBE.uVersion) != TTC_VERSION1 &&
++		BEtoHl(ttOffsetTableBE.uVersion) != TTAG_OTTO)
++	{
++		fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : invalid font version %x", (unsigned int)BEtoHl(ttOffsetTableBE.uVersion));
++	}
++
++	// determine the name table's offset by iterating through the offset table
++	count = BEtoHs(ttOffsetTableBE.uNumOfTables);
++	for (i = 0; i < count; i++)
++	{
++		int entryOffset = offset + sizeof(TT_OFFSET_TABLE) + i * sizeof(TT_TABLE_DIRECTORY);
++		safe_read(ctx, file, entryOffset, (char *)&tblDirBE, sizeof(TT_TABLE_DIRECTORY));
++		if (!BEtoHl(tblDirBE.uTag) || BEtoHl(tblDirBE.uTag) == TTAG_name)
++			break;
++	}
++	if (count == i || !BEtoHl(tblDirBE.uTag))
++		fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : nameless font");
++	tblOffset = BEtoHl(tblDirBE.uOffset);
++
++	// read the 'name' table for record count and offsets
++	safe_read(ctx, file, tblOffset, (char *)&ttNTHeaderBE, sizeof(TT_NAME_TABLE_HEADER));
++	offset = tblOffset + sizeof(TT_NAME_TABLE_HEADER);
++	tblOffset += BEtoHs(ttNTHeaderBE.uStorageOffset);
++
++	// read through the strings for PostScript name and font family
++	count = BEtoHs(ttNTHeaderBE.uNRCount);
++	for (i = 0; i < count; i++)
++	{
++		short langId, nameId;
++		BOOL isCJKName;
++
++		safe_read(ctx, file, offset + i * sizeof(TT_NAME_RECORD), (char *)&ttRecordBE, sizeof(TT_NAME_RECORD));
++
++		langId = BEtoHs(ttRecordBE.uLanguageID);
++		nameId = BEtoHs(ttRecordBE.uNameID);
++		isCJKName = TT_NAME_ID_FONT_FAMILY == nameId && LANG_CHINESE == PRIMARYLANGID(langId);
++
++		// ignore non-English strings (except for Chinese font names)
++		if (langId && langId != TT_MS_LANGID_ENGLISH_UNITED_STATES && !isCJKName)
++			continue;
++		// ignore names other than font (sub)family and PostScript name
++		fz_try(ctx)
++		{
++			if (isCJKName)
++				read_ttf_string(ctx, file, tblOffset, &ttRecordBE, szCJKName, sizeof(szCJKName));
++			else if (TT_NAME_ID_FONT_FAMILY == nameId)
++				read_ttf_string(ctx, file, tblOffset, &ttRecordBE, szTTName, sizeof(szTTName));
++			else if (TT_NAME_ID_FONT_SUBFAMILY == nameId)
++				read_ttf_string(ctx, file, tblOffset, &ttRecordBE, szStyle, sizeof(szStyle));
++			else if (TT_NAME_ID_PS_NAME == nameId)
++				read_ttf_string(ctx, file, tblOffset, &ttRecordBE, szPSName, sizeof(szPSName));
++		}
++		fz_catch(ctx)
++		{
++			fz_warn(ctx, "ignoring face name decoding fonterror");
++		}
++	}
++
++	// try to prevent non-Arial fonts from accidentally substituting Arial
++	if (!strcmp(szPSName, "ArialMT"))
++	{
++		// cf. https://code.google.com/p/sumatrapdf/issues/detail?id=2471
++		if (strcmp(szTTName, "Arial") != 0)
++			szPSName[0] = '\0';
++		// TODO: is there a better way to distinguish Arial Caps from Arial proper?
++		// cf. http://code.google.com/p/sumatrapdf/issues/detail?id=1290
++		else if (strstr(path, "caps") || strstr(path, "Caps"))
++			fz_throw(ctx, FZ_ERROR_GENERIC, "ignore %s, as it can't be distinguished from Arial,Regular", path);
++	}
++
++	if (szPSName[0])
++		append_mapping(ctx, &fontlistMS, szPSName, path, index);
++	if (szTTName[0])
++	{
++		// derive a PostScript-like name and add it, if it's different from the font's
++		// included PostScript name; cf. http://code.google.com/p/sumatrapdf/issues/detail?id=376
++		makeFakePSName(szTTName, szStyle);
++		// compare the two names before adding this one
++		if (lookup_compare(szTTName, szPSName))
++			append_mapping(ctx, &fontlistMS, szTTName, path, index);
++	}
++	if (szCJKName[0])
++	{
++		makeFakePSName(szCJKName, szStyle);
++		if (lookup_compare(szCJKName, szPSName) && lookup_compare(szCJKName, szTTName))
++			append_mapping(ctx, &fontlistMS, szCJKName, path, index);
++	}
++}
++
++static void
++parseTTFs(fz_context *ctx, const char *path)
++{
++	fz_stream *file = fz_open_file(ctx, path);
++	/* "fonterror : %s not found", path */
++	fz_try(ctx)
++	{
++		parseTTF(ctx, file, 0, 0, path);
++	}
++	fz_always(ctx)
++	{
++		fz_drop_stream(ctx,file);
++	}
++	fz_catch(ctx)
++	{
++		fz_rethrow(ctx);
++	}
++}
++
++static void
++parseTTCs(fz_context *ctx, const char *path)
++{
++	FONT_COLLECTION fontcollectionBE;
++	ULONG i, numFonts, *offsettableBE = NULL;
++
++	fz_stream *file = fz_open_file(ctx, path);
++	/* "fonterror : %s not found", path */
++
++	fz_var(offsettableBE);
++
++	fz_try(ctx)
++	{
++		safe_read(ctx, file, 0, (char *)&fontcollectionBE, sizeof(FONT_COLLECTION));
++		if (BEtoHl(fontcollectionBE.Tag) != TTAG_ttcf)
++			fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : wrong format %x", (unsigned int)BEtoHl(fontcollectionBE.Tag));
++		if (BEtoHl(fontcollectionBE.Version) != TTC_VERSION1 &&
++			BEtoHl(fontcollectionBE.Version) != TTC_VERSION2)
++		{
++			fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : invalid version %x", (unsigned int)BEtoHl(fontcollectionBE.Version));
++		}
++
++		numFonts = BEtoHl(fontcollectionBE.NumFonts);
++		offsettableBE = fz_malloc_array(ctx, numFonts, sizeof(ULONG));
++
++		safe_read(ctx, file, sizeof(FONT_COLLECTION), (char *)offsettableBE, numFonts * sizeof(ULONG));
++		for (i = 0; i < numFonts; i++)
++			parseTTF(ctx, file, BEtoHl(offsettableBE[i]), i, path);
++	}
++	fz_always(ctx)
++	{
++		fz_free(ctx, offsettableBE);
++		fz_drop_stream(ctx,file);
++	}
++	fz_catch(ctx)
++	{
++		fz_rethrow(ctx);
++	}
++}
++
++static void
++extend_system_font_list(fz_context *ctx, const WCHAR *path)
++{
++	WCHAR szPath[MAX_PATH], *lpFileName;
++	WIN32_FIND_DATA FileData;
++	HANDLE hList;
++
++	GetFullPathName(path, nelem(szPath), szPath, &lpFileName);
++
++	hList = FindFirstFile(szPath, &FileData);
++	if (hList == INVALID_HANDLE_VALUE)
++	{
++		// Don't complain about missing directories
++		if (GetLastError() == ERROR_FILE_NOT_FOUND)
++			return;
++		fz_throw(ctx, FZ_ERROR_GENERIC, "extend_system_font_list: unknown error %d", (int)GetLastError());
++	}
++	do
++	{
++		if (!(FileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY))
++		{
++			char szPathUtf8[MAX_PATH], *fileExt;
++			int res;
++			lstrcpyn(lpFileName, FileData.cFileName, szPath + MAX_PATH - lpFileName);
++			res = WideCharToMultiByte(CP_UTF8, 0, szPath, -1, szPathUtf8, sizeof(szPathUtf8), NULL, NULL);
++			if (!res)
++			{
++				fz_warn(ctx, "WideCharToMultiByte failed for %S", szPath);
++				continue;
++			}
++			fileExt = szPathUtf8 + strlen(szPathUtf8) - 4;
++			fz_try(ctx)
++			{
++				if (!_stricmp(fileExt, ".ttc"))
++					parseTTCs(ctx, szPathUtf8);
++				else if (!_stricmp(fileExt, ".ttf") || !_stricmp(fileExt, ".otf"))
++					parseTTFs(ctx, szPathUtf8);
++			}
++			fz_catch(ctx)
++			{
++				// ignore errors occurring while parsing a given font file
++			}
++		}
++	} while (FindNextFile(hList, &FileData));
++	FindClose(hList);
++}
++
++static void
++destroy_system_font_list(void)
++{
++	free(fontlistMS.fontmap);
++	memset(&fontlistMS, 0, sizeof(fontlistMS));
++}
++
++static void
++create_system_font_list(fz_context *ctx)
++{
++	WCHAR szFontDir[MAX_PATH];
++	UINT cch;
++
++	cch = GetWindowsDirectory(szFontDir, nelem(szFontDir) - 12);
++	if (0 < cch && cch < nelem(szFontDir) - 12)
++	{
++        /* willus.com edit--Win XP default MSVCRT.DLL doesn't have wcscat_s */
++#ifdef _WIN64
++		wcscat_s(szFontDir, MAX_PATH, L"\\Fonts\\*.?t?");
++#else
++		wcscat(szFontDir,L"\\Fonts\\*.?t?");
++#endif
++		extend_system_font_list(ctx, szFontDir);
++	}
++
++	if (fontlistMS.len == 0)
++		fz_warn(ctx, "couldn't find any usable system fonts");
++
++#ifdef NOCJKFONT
++	{
++		// If no CJK fallback font is builtin but one has been shipped separately (in the same
++		// directory as the main executable), add it to the list of loadable system fonts
++		WCHAR szFile[MAX_PATH], *lpFileName;
++		GetModuleFileName(0, szFontDir, MAX_PATH);
++		GetFullPathName(szFontDir, MAX_PATH, szFile, &lpFileName);
++		lstrcpyn(lpFileName, L"DroidSansFallback.ttf", szFile + MAX_PATH - lpFileName);
++		extend_system_font_list(ctx, szFile);
++	}
++#endif
++
++	// sort the font list, so that it can be searched binarily
++	qsort(fontlistMS.fontmap, fontlistMS.len, sizeof(pdf_fontmapMS), _stricmp);
++
++#ifdef DEBUG
++	// allow to overwrite system fonts for debugging purposes
++	// (either pass a full path or a search pattern such as "fonts\*.ttf")
++	cch = GetEnvironmentVariable(L"MUPDF_FONTS_PATTERN", szFontDir, nelem(szFontDir));
++	if (0 < cch && cch < nelem(szFontDir))
++	{
++		int i, prev_len = fontlistMS.len;
++		extend_system_font_list(ctx, szFontDir);
++		for (i = prev_len; i < fontlistMS.len; i++)
++		{
++			pdf_fontmapMS *entry = bsearch(fontlistMS.fontmap[i].fontface, fontlistMS.fontmap, prev_len, sizeof(pdf_fontmapMS), lookup_compare);
++			if (entry)
++				*entry = fontlistMS.fontmap[i];
++		}
++		qsort(fontlistMS.fontmap, fontlistMS.len, sizeof(pdf_fontmapMS), _stricmp);
++	}
++#endif
++
++	// make sure to clean up after ourselves
++	atexit(destroy_system_font_list);
++}
++
++static fz_font *
++pdf_load_windows_font_by_name(fz_context *ctx, const char *orig_name)
++{
++	pdf_fontmapMS *found = NULL;
++	char *comma, *fontname;
++	fz_font *font;
++
++    /* WILLUS MOD--not multi-threaded for k2pdfopt */
++	/* fz_synchronize_begin(); */
++	if (fontlistMS.len == 0)
++	{
++		fz_try(ctx)
++		{
++			create_system_font_list(ctx);
++		}
++		fz_catch(ctx) { }
++	}
++    /* WILLUS MOD--not multi-threaded for k2pdfopt */
++	/* fz_synchronize_end(); */
++	if (fontlistMS.len == 0)
++		fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror: couldn't find any fonts");
++
++	// work on a normalized copy of the font name
++	fontname = fz_strdup(ctx, orig_name);
++	remove_spaces(fontname);
++
++	// first, try to find the exact font name (including appended style information)
++	comma = strchr(fontname, ',');
++	if (comma)
++	{
++		*comma = '-';
++		found = pdf_find_windows_font_path(fontname);
++		*comma = ',';
++	}
++	// second, substitute the font name with a known PostScript name
++	else
++	{
++		int i;
++		for (i = 0; i < nelem(baseSubstitutes) && !found; i++)
++			if (!strcmp(fontname, baseSubstitutes[i].name))
++				found = pdf_find_windows_font_path(baseSubstitutes[i].pattern);
++	}
++	// third, search for the font name without additional style information
++	if (!found)
++		found = pdf_find_windows_font_path(fontname);
++	// fourth, try to separate style from basename for prestyled fonts (e.g. "ArialBold")
++	if (!found && !comma && (str_ends_with(fontname, "Bold") || str_ends_with(fontname, "Italic")))
++	{
++		int styleLen = str_ends_with(fontname, "Bold") ? 4 : str_ends_with(fontname, "BoldItalic") ? 10 : 6;
++		fontname = fz_resize_array(ctx, fontname, strlen(fontname) + 2, sizeof(char));
++		comma = fontname + strlen(fontname) - styleLen;
++		memmove(comma + 1, comma, styleLen + 1);
++		*comma = '-';
++		found = pdf_find_windows_font_path(fontname);
++		*comma = ',';
++		if (!found)
++			found = pdf_find_windows_font_path(fontname);
++	}
++	// fifth, try to convert the font name from the common Chinese codepage 936
++	if (!found && fontname[0] < 0)
++	{
++		WCHAR cjkNameW[MAX_FACENAME];
++		char cjkName[MAX_FACENAME];
++		if (MultiByteToWideChar(936, MB_ERR_INVALID_CHARS, fontname, -1, cjkNameW, nelem(cjkNameW)) &&
++			WideCharToMultiByte(CP_UTF8, 0, cjkNameW, -1, cjkName, nelem(cjkName), NULL, NULL))
++		{
++			comma = strchr(cjkName, ',');
++			if (comma)
++			{
++				*comma = '-';
++				found = pdf_find_windows_font_path(cjkName);
++				*comma = ',';
++			}
++			if (!found)
++				found = pdf_find_windows_font_path(cjkName);
++		}
++	}
++
++	fz_free(ctx, fontname);
++	if (!found)
++		fz_throw(ctx, FZ_ERROR_GENERIC, "couldn't find system font '%s'", orig_name);
++
++    /*
++	fz_warn(ctx, "loading non-embedded font '%s' from '%s'", orig_name, found->fontpath);
++    */
++
++	font = fz_new_font_from_file(ctx, orig_name, found->fontpath, found->index,
++		strcmp(found->fontface, "DroidSansFallback") != 0);
++    /* willus mod for MuPDF v1.10, 10-21-2016 */
++    {
++    fz_font_flags_t *flags;
++    flags=fz_font_flags(font);
++    if (flags!=NULL)
++    	flags->ft_substitute = 1;
++    }
++	return font;
++}
++
++static fz_font *
++pdf_load_windows_font(fz_context *ctx, const char *fontname, int bold, int italic, int needs_exact_metrics)
++{
++	if (needs_exact_metrics)
++	{
++		const char *clean_name;
++        /* WILLUS: Declare pdf_clean_base14_name() */
++        extern const char *pdf_clean_base14_name(const char *fontname);
++
++		/* TODO: the metrics for Times-Roman and Courier don't match
++		   those of Windows' Times New Roman and Courier New; for
++		   some reason, Poppler doesn't seem to have this problem */
++		int len;
++		if (fz_lookup_builtin_font(ctx,fontname, bold, italic, &len))
++			return NULL;
++
++		/* cf. http://code.google.com/p/sumatrapdf/issues/detail?id=2173 */
++		clean_name = pdf_clean_base14_name(fontname);
++		if (clean_name != fontname && !strncmp(clean_name, "Times-", 6))
++			return NULL;
++	}
++
++	// TODO: unset font->ft_substitute for base14/needs_exact_metrics?
++	return pdf_load_windows_font_by_name(ctx, fontname);
++}
++
++static const char *clean_font_name(const char *fontname)
++{
++	int i, k;
++	for (i = 0; i < nelem(base_font_names); i++)
++		for (k = 0; base_font_names[i][k]; k++)
++			if (!strcmp_ignore_space(base_font_names[i][k], fontname))
++				return base_font_names[i][0];
++	return fontname;
++}
++
++
++/* SumatraPDF: expose clean_font_name */
++static const char * pdf_clean_base14_name(const char *fontname)
++{
++	return clean_font_name(fontname);
++}
++
++static fz_font *
++pdf_load_windows_cjk_font(fz_context *ctx, const char *fontname, int ros, int serif)
++{
++	fz_font *font;
++
++    font=NULL; /* WILLUS: Avoid compiler warning */
++	/* try to find a matching system font before falling back to an approximate one */
++	fz_try(ctx)
++	{
++		font = pdf_load_windows_font_by_name(ctx, fontname);
++	}
++	fz_catch(ctx)
++	{
++		font = NULL;
++	}
++	if (font)
++		return font;
++
++	/* try to fall back to a reasonable system font */
++	fz_try(ctx)
++	{
++		if (serif)
++		{
++			switch (ros)
++			{
++			case FZ_ADOBE_CNS: font = pdf_load_windows_font_by_name(ctx, "MingLiU"); break;
++			case FZ_ADOBE_GB: font = pdf_load_windows_font_by_name(ctx, "SimSun"); break;
++			case FZ_ADOBE_JAPAN: font = pdf_load_windows_font_by_name(ctx, "MS-Mincho"); break;
++			case FZ_ADOBE_KOREA: font = pdf_load_windows_font_by_name(ctx, "Batang"); break;
++			default: fz_throw(ctx, FZ_ERROR_GENERIC, "invalid serif ros");
++			}
++		}
++		else
++		{
++			switch (ros)
++			{
++			case FZ_ADOBE_CNS: font = pdf_load_windows_font_by_name(ctx, "DFKaiShu-SB-Estd-BF"); break;
++			case FZ_ADOBE_GB:
++				fz_try(ctx)
++				{
++					font = pdf_load_windows_font_by_name(ctx, "KaiTi");
++				}
++				fz_catch(ctx)
++				{
++					font = pdf_load_windows_font_by_name(ctx, "KaiTi_GB2312");
++				}
++				break;
++			case FZ_ADOBE_JAPAN: font = pdf_load_windows_font_by_name(ctx, "MS-Gothic"); break;
++			case FZ_ADOBE_KOREA: font = pdf_load_windows_font_by_name(ctx, "Gulim"); break;
++			default: fz_throw(ctx, FZ_ERROR_GENERIC, "invalid sans-serif ros");
++			}
++		}
++	}
++	fz_catch(ctx)
++	{
++#ifdef NOCJKFONT
++		/* If no CJK fallback font is builtin, maybe one has been shipped separately */
++		font = pdf_load_windows_font_by_name(ctx, "DroidSansFallback");
++#else
++		fz_rethrow(ctx);
++#endif
++	}
++
++	return font;
++}
++
++#endif
++
++void pdf_install_load_system_font_funcs(fz_context *ctx)
++{
++#ifdef _WIN32
++	fz_install_load_system_font_funcs(ctx, pdf_load_windows_font, pdf_load_windows_cjk_font, NULL);
++#endif
++}
+diff --git a/source/fitz/font.c b/source/fitz/font.c
+index 733d91dae..69c46d968 100644
+--- a/source/fitz/font.c
++++ b/source/fitz/font.c
+@@ -5,8 +5,11 @@
+ #include "draw-imp.h"
+ 
+ #include <ft2build.h>
++/* willus mod -- remove hb includes */
++/*
+ #include "hb.h"
+ #include "hb-ft.h"
++*/
+ 
+ #include <assert.h>
+ 
+diff --git a/source/fitz/stext-device.c b/source/fitz/stext-device.c
+index 0ba944d44..3c05c51ac 100644
+--- a/source/fitz/stext-device.c
++++ b/source/fitz/stext-device.c
+@@ -692,6 +692,11 @@ fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options
+ 	dev->trm = fz_identity;
+ 	dev->lastchar = ' ';
+ 	dev->curdir = 1;
++    /* willus mod -- seems like this should be here, but not sure. */
++    if (opts)
++        dev->flags = opts->flags;
++    else
++        dev->flags = 0;
+ 
+ 	return (fz_device*)dev;
+ }
+diff --git a/source/fitz/string.c b/source/fitz/string.c
+index e70ae6e6e..b310463f4 100644
+--- a/source/fitz/string.c
++++ b/source/fitz/string.c
+@@ -448,6 +448,10 @@ fz_utflen(const char *s)
+ 
+ float fz_atof(const char *s)
+ {
++/* willus mod: atof(s), #if-#else-#endif */
++#if (!defined(__SSE__))
++    return(atof(s));
++#else
+ 	float result;
+ 
+ 	errno = 0;
+@@ -457,6 +461,7 @@ float fz_atof(const char *s)
+ 		return 1;
+ 	result = fz_clamp(result, -FLT_MAX, FLT_MAX);
+ 	return result;
++#endif
+ }
+ 
+ int fz_atoi(const char *s)
+diff --git a/source/pdf/pdf-annot.c b/source/pdf/pdf-annot.c
+index 68de8898a..5d43485bd 100644
+--- a/source/pdf/pdf-annot.c
++++ b/source/pdf/pdf-annot.c
+@@ -4,8 +4,20 @@
+ #include <string.h>
+ #include <time.h>
+ 
++/* willus mod--don't use _mkgmtime--not available in Win XP */
+ #ifdef _WIN32
+-#define timegm _mkgmtime
++static time_t timegm(struct tm *date);
++static time_t timegm(struct tm *date)
++
++    {
++    time_t t,z;
++    struct tm gmz;
++
++    z=(time_t)0;
++    gmz=(*gmtime(&z));
++    t=mktime(date)-mktime(&gmz);
++    return(t);
++    }
+ #endif
+ 
+ #define TEXT_ANNOT_SIZE (25.0f)
+diff --git a/source/pdf/pdf-link.c b/source/pdf/pdf-link.c
+index ae5beaa35..b5a52a000 100644
+--- a/source/pdf/pdf-link.c
++++ b/source/pdf/pdf-link.c
+@@ -351,6 +351,9 @@ pdf_resolve_link(fz_context *ctx, pdf_document *doc, const char *uri, float *xp,
+ 		}
+ 		return page;
+ 	}
++/* willus mod -- be quiet */
++/*
+ 	fz_warn(ctx, "unknown link uri '%s'", uri);
++*/
+ 	return -1;
+ }
+diff --git a/source/pdf/pdf-parse.c b/source/pdf/pdf-parse.c
+index 501c5626a..927ba6cd5 100644
+--- a/source/pdf/pdf-parse.c
++++ b/source/pdf/pdf-parse.c
+@@ -586,9 +586,14 @@ pdf_parse_ind_obj(fz_context *ctx, pdf_document *doc,
+ 			if (c == '\r')
+ 			{
+ 				c = fz_peek_byte(ctx, file);
++/* willus mod -- no warning */
++/*
+ 				if (c != '\n')
+ 					fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen);
+ 				else
++*/
++if (c=='\n')
++/* willus mod -- end */
+ 					fz_read_byte(ctx, file);
+ 			}
+ 			stm_ofs = fz_tell(ctx, file);
+diff --git a/source/pdf/pdf-xref.c b/source/pdf/pdf-xref.c
+index 2475b6e86..bc163563a 100644
+--- a/source/pdf/pdf-xref.c
++++ b/source/pdf/pdf-xref.c
+@@ -707,8 +707,11 @@ pdf_xref_size_from_old_trailer(fz_context *ctx, pdf_document *doc, pdf_lexbuf *b
+ 		if (!s)
+ 			fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection length missing");
+ 		len = fz_atoi(fz_strsep(&s, " "));
+/* willus mod -- accept negative subsection lengths (this disables an fz_throw, not a warning) */
++/*
+ 		if (len < 0)
+ 			fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection length must be positive");
++*/
+ 
+ 		/* broken pdfs where the section is not on a separate line */
+ 		if (s && *s != '\0')
+@@ -1372,7 +1375,10 @@ pdf_init_document(fz_context *ctx, pdf_document *doc)
+ 	{
+ 		pdf_drop_xref_sections(ctx, doc);
+ 		fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
++/* willus mod -- be quiet */
++/*
+ 		fz_warn(ctx, "trying to repair broken xref");
++*/
+ 		repaired = 1;
+ 	}
+ 
+@@ -1496,7 +1502,10 @@ pdf_drop_document_imp(fz_context *ctx, pdf_document *doc)
+ 		/* Swallow error, but continue dropping */
+ 	}
+ 
+/* willus mod -- no pdf_drop_js */
++/*
+ 	pdf_drop_js(ctx, doc->js);
++*/
+ 
+ 	pdf_drop_xref_sections(ctx, doc);
+ 	fz_free(ctx, doc->xref_index);
+-- 
+2.22.0
+
diff --git a/pkgs/applications/misc/k2pdfopt/tesseract.patch b/pkgs/applications/misc/k2pdfopt/tesseract.patch
index b882f5b949c38..adfee9ae282f3 100644
--- a/pkgs/applications/misc/k2pdfopt/tesseract.patch
+++ b/pkgs/applications/misc/k2pdfopt/tesseract.patch
@@ -1,13 +1,675 @@
+From 39aa8502eee7bb669a29d1a9b3bfe5c9595ad960 Mon Sep 17 00:00:00 2001
+From: Daniel Fullmer <danielrf12@gmail.com>
+Date: Fri, 13 Sep 2019 13:45:05 -0400
+Subject: [PATCH] Willus mod changes from k2pdfopt
+
+---
+ src/api/Makefile.am        |   1 +
+ src/api/baseapi.cpp        |  87 +++++++++++
+ src/api/baseapi.h          |   3 +
+ src/api/tesscapi.cpp       | 311 +++++++++++++++++++++++++++++++++++++
+ src/api/tesseract.h        |  29 ++++
+ src/ccmain/tessedit.cpp    |   5 +-
+ src/ccutil/ccutil.h        |   7 +
+ src/ccutil/genericvector.h |  21 ++-
+ src/ccutil/mainblk.cpp     |  17 +-
+ src/ccutil/params.cpp      |   3 +-
+ src/ccutil/serialis.cpp    |   3 +
+ src/ccutil/serialis.h      |   2 +
+ src/lstm/input.cpp         |   3 +
+ 13 files changed, 488 insertions(+), 4 deletions(-)
+ create mode 100644 src/api/tesscapi.cpp
+ create mode 100644 src/api/tesseract.h
+
 diff --git a/src/api/Makefile.am b/src/api/Makefile.am
-index d8c1e54..46ead13 100644
+index d9b76eb6..cd2dc30f 100644
 --- a/src/api/Makefile.am
 +++ b/src/api/Makefile.am
-@@ -42,7 +42,7 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS)
- if VISIBILITY
- libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS
- endif
--libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp pdfrenderer.cpp
-+libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp pdfrenderer.cpp tesscapi.cpp
+@@ -39,6 +39,7 @@ libtesseract_api_la_SOURCES += lstmboxrenderer.cpp
+ libtesseract_api_la_SOURCES += pdfrenderer.cpp
+ libtesseract_api_la_SOURCES += wordstrboxrenderer.cpp
+ libtesseract_api_la_SOURCES += renderer.cpp
++libtesseract_api_la_SOURCES += tesscapi.cpp
  
  lib_LTLIBRARIES += libtesseract.la
- libtesseract_la_LDFLAGS = 
+ libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) $(OPENCL_LDFLAGS) $(libarchive_LIBS)
+diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp
+index 9245d07c..ea964ee6 100644
+--- a/src/api/baseapi.cpp
++++ b/src/api/baseapi.cpp
+@@ -215,6 +215,14 @@ TessBaseAPI::TessBaseAPI()
+   // Use the current locale if building debug code.
+   std::locale::global(std::locale(""));
+ #endif
++  const char *locale;
++  locale = std::setlocale(LC_ALL, nullptr);
++/* willus mod Remove assertions--taken care of in tesscapi.cpp */
++//  ASSERT_HOST(!strcmp(locale, "C"));
++  locale = std::setlocale(LC_CTYPE, nullptr);
++//  ASSERT_HOST(!strcmp(locale, "C"));
++  locale = std::setlocale(LC_NUMERIC, nullptr);
++//  ASSERT_HOST(!strcmp(locale, "C"));
+ }
+ 
+ TessBaseAPI::~TessBaseAPI() {
+@@ -1333,6 +1341,85 @@ static void AddBoxToTSV(const PageIterator* it, PageIteratorLevel level,
+   text->add_str_int("\t", bottom - top);
+ }
+ 
++/* willus mod */
++int TessBaseAPI::GetOCRWords(int **x00,int **y00,int **x11,int **y11,int **ybaseline0,
++                             char **utf8words)
++
++    {
++    /*
++    ** Collects every non-empty word the result iterator yields.  (*x00) receives
++    ** ONE malloc'd block of 5*nwords ints; (*y00), (*x11), (*y11), (*ybaseline0)
++    ** point into that same block, so the caller frees only (*x00).  (*utf8words)
++    ** receives a second malloc'd block holding the words back to back, each
++    ** NUL-terminated.  Returns the word count; if no result iterator is
++    ** available, returns 0 with all six output pointers set to NULL.
++    */
++    int iword,nwords,totlen,it8;
++    int *x0,*y0,*x1,*y1,*ybaseline;
++    char *tutf8;
++
++    /* GetIterator() hands back a new object the caller must delete; holding it */
++    /* in a unique_ptr releases it on every return path (it used to be leaked). */
++    std::unique_ptr<ResultIterator> res_it(GetIterator());
++    (*x00)=(*y00)=(*x11)=(*y11)=(*ybaseline0)=nullptr;
++    (*utf8words)=nullptr;
++    if (!res_it)
++        return(0);
++    /* Pass 1: count the words and the bytes needed to store their text */
++    iword=0;
++    totlen=0;
++    while (!res_it->Empty(RIL_BLOCK))
++        {
++        if (res_it->Empty(RIL_WORD))
++            {
++            res_it->Next(RIL_WORD);
++            continue;
++            }
++        iword++;
++        STRING textstr=std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
++        totlen+=strlen(textstr.string())+1;
++        res_it->Next(RIL_WORD);
++        }
++    nwords=iword;
++    x0=(*x00)=(int *)malloc(sizeof(int)*5*nwords);
++    y0=(*y00)=&x0[nwords];
++    x1=(*x11)=&y0[nwords];
++    y1=(*y11)=&x1[nwords];
++    ybaseline=(*ybaseline0)=&y1[nwords];
++    tutf8=(*utf8words)=(char *)malloc(totlen);
++    /* Pass 2: rewind and record each word's text, bounding box and baseline */
++    iword=0;
++    it8=0;
++    res_it->Begin();
++    while (!res_it->Empty(RIL_BLOCK))
++        {
++        if (res_it->Empty(RIL_WORD))
++            {
++            res_it->Next(RIL_WORD);
++            continue;
++            }
++        STRING textstr=std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
++        strcpy(&tutf8[it8],textstr.string());
++        it8 += strlen(&tutf8[it8])+1;
++        int left, top, right, bottom;
++        int u1,v1,u2,v2;
++        res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
++        res_it->Baseline(RIL_WORD, &u1, &v1, &u2, &v2);
++        x0[iword]=left;
++        x1[iword]=right;
++        y0[iword]=top;
++        y1[iword]=bottom;
++        /* store the mean of the y values of the baseline's two endpoints */
++        ybaseline[iword]=(v1+v2)/2;
++        iword++;
++        res_it->Next(RIL_WORD);
++        }
++    return(iword);
++    }
++
++/* willus mod */
++int GetOCRWords(int **x0,int **y0,int **x1,int **y1,int **ybaseline,char **utf8words);
++
+ /**
+  * Make a TSV-formatted string from the internal data structures.
+  * page_number is 0-based but will appear in the output as 1-based.
+diff --git a/src/api/baseapi.h b/src/api/baseapi.h
+index 3724dd92..23be5920 100644
+--- a/src/api/baseapi.h
++++ b/src/api/baseapi.h
+@@ -575,6 +575,9 @@ class TESS_API TessBaseAPI {
+    */
+   char* GetHOCRText(ETEXT_DESC* monitor, int page_number);
+ 
++/* willus mod */
++int GetOCRWords(int **x0,int **y0,int **x1,int **y1,int **ybaseline,char **utf8words);
++
+   /**
+    * Make a HTML-formatted string with hOCR markup from the internal
+    * data structures.
+diff --git a/src/api/tesscapi.cpp b/src/api/tesscapi.cpp
+new file mode 100644
+index 00000000..1752fafe
+--- /dev/null
++++ b/src/api/tesscapi.cpp
+@@ -0,0 +1,311 @@
++/*
++** tesscapi.cpp    willus.com attempt at C wrapper for tesseract.
++**                 (Butchered from tesseractmain.cpp)
++**                 Last updated 9-1-12
++**
++** Copyright (C) 2012  http://willus.com
++**
++** This program is free software: you can redistribute it and/or modify
++** it under the terms of the GNU Affero General Public License as
++** published by the Free Software Foundation, either version 3 of the
++** License, or (at your option) any later version.
++**
++** This program is distributed in the hope that it will be useful,
++** but WITHOUT ANY WARRANTY; without even the implied warranty of
++** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++** GNU Affero General Public License for more details.
++**
++** You should have received a copy of the GNU Affero General Public License
++** along with this program.  If not, see <http://www.gnu.org/licenses/>.
++**
++*/
++
++/*
++#include "mfcpch.h"
++*/
++// #define USE_VLD //Uncomment for Visual Leak Detector.
++#if (defined _MSC_VER && defined USE_VLD)
++#include <vld.h>
++#endif
++
++// Include automatically generated configuration file if running autoconf
++#ifdef HAVE_CONFIG_H
++#include "config_auto.h"
++#endif
++#include <locale.h>
++#ifdef USING_GETTEXT
++#include <libintl.h>
++#define _(x) gettext(x)
++#else
++#define _(x) (x)
++#endif
++
++#include "allheaders.h"
++#include "baseapi.h"
++#include "strngs.h"
++#include "params.h"
++#include "blobs.h"
++#include "simddetect.h"
++#include "tesseractclass.h"
++/*
++#include "notdll.h"
++*/
++
++/* C Wrappers */
++#include "tesseract.h"
++
++// static tesseract::TessBaseAPI api[4];
++
++/*
++** ocr_type=0:  OEM_DEFAULT
++** ocr_type=1:  OEM_TESSERACT_ONLY
++** ocr_type=2:  OEM_LSTM_ONLY
++** ocr_type=3:  OEM_TESSERACT_LSTM_COMBINED
++*/
++void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out,
++                     char *initstr,int maxlen,int *status)
++
++    {
++    char original_locale[256];
++    tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI;
++/*
++printf("@tess_capi_init\n");
++printf("    datapath='%s'\n",datapath);
++printf("    language='%s'\n",language);
++printf("    ocr_type=%d\n",ocr_type);
++*/
++#ifdef USE_NLS
++    setlocale (LC_ALL, "");
++    bindtextdomain (PACKAGE, LOCALEDIR);
++    textdomain (PACKAGE);
++#endif
++    /* willus mod, 11-24-16 */
++    /* Tesseract needs "C" locale to correctly parse all data .traineddata files. */
++/*
++printf("locale='%s'\n",setlocale(LC_ALL,NULL));
++printf("ctype='%s'\n",setlocale(LC_CTYPE,NULL));
++printf("numeric='%s'\n",setlocale(LC_NUMERIC,NULL));
++*/
++    strncpy(original_locale,setlocale(LC_ALL,NULL),255);
++    original_locale[255]='\0';
++/*
++printf("original_locale='%s'\n",original_locale);
++*/
++    setlocale(LC_ALL,"C");
++/*
++printf("new locale='%s'\n",setlocale(LC_ALL,NULL));
++printf("new ctype='%s'\n",setlocale(LC_CTYPE,NULL));
++printf("new numeric='%s'\n",setlocale(LC_NUMERIC,NULL));
++*/
++    // fprintf(stderr, "tesseract %s\n", tesseract::TessBaseAPI::Version());
++    // Make the order of args a bit more forgiving than it used to be.
++    const char* lang = "eng";
++    tesseract::PageSegMode pagesegmode = tesseract::PSM_SINGLE_BLOCK;
++    if (language!=NULL && language[0]!='\0')
++        lang = language;
++    /*
++    if (output == NULL)
++        {
++        fprintf(stderr, _("Usage:%s imagename outputbase [-l lang] "
++                      "[-psm pagesegmode] [configfile...]\n"), argv[0]);
++        fprintf(stderr,
++            _("pagesegmode values are:\n"
++              "0 = Orientation and script detection (OSD) only.\n"
++              "1 = Automatic page segmentation with OSD.\n"
++              "2 = Automatic page segmentation, but no OSD, or OCR\n"
++              "3 = Fully automatic page segmentation, but no OSD. (Default)\n"
++              "4 = Assume a single column of text of variable sizes.\n"
++              "5 = Assume a single uniform block of vertically aligned text.\n"
++              "6 = Assume a single uniform block of text.\n"
++              "7 = Treat the image as a single text line.\n"
++              "8 = Treat the image as a single word.\n"
++              "9 = Treat the image as a single word in a circle.\n"
++              "10 = Treat the image as a single character.\n"));
++        fprintf(stderr, _("-l lang and/or -psm pagesegmode must occur before any"
++                      "configfile.\n"));
++        exit(1);
++        }
++    */
++/*
++printf("SSE = %s\n",SIMDDetect::IsSSEAvailable() ? "AVAILABLE" : "NOT AVAILABLE");
++printf("AVX = %s\n",SIMDDetect::IsAVXAvailable() ? "AVAILABLE" : "NOT AVAILABLE");
++*/
++/*
++v4.00 loads either TESSERACT engine, LSTM engine, or both.  No CUBE.
++*/
++    ocr_type=0; /* Ignore specified and use default */
++    api->SetOutputName(NULL);
++    (*status)=api->Init(datapath,lang,
++              ocr_type==0 ? tesseract::OEM_DEFAULT :
++                (ocr_type==1 ? tesseract::OEM_TESSERACT_ONLY :
++                   (ocr_type==2 ? tesseract::OEM_LSTM_ONLY :
++                                  (tesseract::OEM_TESSERACT_LSTM_COMBINED))));
++    if ((*status)!=0)
++        {
++        /* willus mod, 11-24-16 */
++        setlocale(LC_ALL,original_locale);
++        api->End();
++        delete api;
++        return(NULL);
++        }
++    /*
++    api.Init("tesscapi",lang,tesseract::OEM_DEFAULT,
++           &(argv[arg]), argc - arg, NULL, NULL, false);
++    */
++    // We have 2 possible sources of pagesegmode: a config file and
++    // the command line. For backwards compatability reasons, the
++    // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
++    // default for this program is tesseract::PSM_AUTO. We will let
++    // the config file take priority, so the command-line default
++    // can take priority over the tesseract default, so we use the
++    // value from the command line only if the retrieved mode
++    // is still tesseract::PSM_SINGLE_BLOCK, indicating no change
++    // in any config file. Therefore the only way to force
++    // tesseract::PSM_SINGLE_BLOCK is from the command line.
++    // It would be simpler if we could set the value before Init,
++    // but that doesn't work.
++    if (api->GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK)
++        api->SetPageSegMode(pagesegmode);
++
++    /*
++    ** Initialization message
++    */
++    {
++    char istr[1024];
++    int sse,avx;
++
++// printf("tessedit_ocr_engine_mode = %d\n",tessedit_ocr_engine_mode);
++    sprintf(istr,"%s",api->Version());
++    sse=tesseract::SIMDDetect::IsSSEAvailable();
++    avx=tesseract::SIMDDetect::IsAVXAvailable();
++    if (sse || avx)
++        sprintf(&istr[strlen(istr)]," [%s]",sse&&avx?"SSE+AVX":(sse?"SSE":"AVX"));
++    sprintf(&istr[strlen(istr)],"\n    Tesseract data folder = '%s'",datapath==NULL?getenv("TESSDATA_PREFIX"):datapath);
++    strcat(istr,"\n    Tesseract languages: ");
++    GenericVector<STRING> languages;
++    api->GetLoadedLanguagesAsVector(&languages);
++/*
++printf("OEM=%d\n",api->oem());
++printf("Langs='%s'\n",api->GetInitLanguagesAsString());
++printf("AnyTessLang()=%d\n",(int)api->tesseract()->AnyTessLang());
++printf("AnyLSTMLang()=%d\n",(int)api->tesseract()->AnyLSTMLang());
++printf("num_sub_langs()=%d\n",api->tesseract()->num_sub_langs());
++printf("languages.size()=%d\n",(int)languages.size());
++*/
++
++    for (int i=0;i<=api->tesseract()->num_sub_langs();i++)
++        {
++        tesseract::Tesseract *lang1;
++        int eng;
++        lang1 = i==0 ? api->tesseract() : api->tesseract()->get_sub_lang(i-1);
++        eng=(int)lang1->tessedit_ocr_engine_mode;
++        sprintf(&istr[strlen(istr)],"%s%s [%s]",i==0?"":", ",lang1->lang.string(),
++                 eng==2?"LSTM+Tess":(eng==1?"LSTM":"Tess"));
++        }
++/*
++printf("%d. '%s'\n",i+1,languages[i].string());
++printf("    sublang[%d].oem_engine = %d\n",i+1,(int)api->tesseract()->get_sub_lang(i)->tessedit_ocr_engine_mode);
++*/
++
++    /*
++    if (ocr_type==0 || ocr_type==3)
++        sprintf(&istr[strlen(istr)],"[LSTM+] (lang=");
++    else if (ocr_type==2)
++        sprintf(&istr[strlen(istr)],"[LSTM] (lang=");
++    strncpy(&istr[strlen(istr)],language,253-strlen(istr));
++    istr[253]='\0';
++    strcat(istr,")");
++    */
++    if (out!=NULL)
++        fprintf(out,"%s\n",istr);
++    if (initstr!=NULL)
++        {
++        strncpy(initstr,istr,maxlen-1);
++        initstr[maxlen-1]='\0';
++        }
++    }
++
++
++    /* Turn off LSTM debugging output */
++    api->SetVariable("lstm_debug_level","0");
++#if (WILLUSDEBUG & 1)
++    api->SetVariable("lstm_debug_level","9");
++    api->SetVariable("paragraph_debug_level","9");
++    api->SetVariable("tessdata_manager_debug_level","9");
++    api->SetVariable("tosp_debug_level","9");
++    api->SetVariable("wordrec_debug_level","9");
++    api->SetVariable("segsearch_debug_level","9");
++#endif
++    /* willus mod, 11-24-16 */
++    setlocale(LC_ALL,original_locale);
++    return((void *)api);
++    }
++
++
++/* OCR pix into outstr (NUL-terminated, at most maxlen-1 bytes). Returns 0 on success, -1 on error. */
++int tess_capi_get_ocr(void *vapi,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out)
++    {
++    tesseract::TessBaseAPI *api;
++    char *text;
++
++    if (vapi==NULL || outstr==NULL || maxlen<=0)
++        return(-1);
++    api=(tesseract::TessBaseAPI *)vapi;
++    /* Set the mode on every call.  tess_capi_get_ocr_multiword() can change it on this */
++    /* same api object, so a function-local "last mode" cache would go stale. */
++    api->SetPageSegMode((tesseract::PageSegMode)segmode);
++    text = api->ProcessPage(pix,0,NULL,NULL,0,NULL) ? api->GetUTF8Text() : NULL;
++    api->Clear();
++    if (text==NULL)
++        {
++        if (out!=NULL)
++            fprintf(out,"tesscapi:  Error during bitmap processing.\n");
++        return(-1);
++        }
++    strncpy(outstr,text,maxlen-1);
++    outstr[maxlen-1]='\0';
++    delete [] text;   /* GetUTF8Text() hands ownership of the buffer to the caller */
++    return(0);
++    }
++
++
++/* OCR pix and hand back per-word results through left/top/right/bottom/ybase/text. */
++/* *nw receives GetOCRWords()'s return value.  Returns 0 on success, -1 (and *nw = 0) on failure. */
++int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode,
++                                int **left,int **top,int **right,int **bottom,
++                                int **ybase,char **text,int *nw,
++                                FILE *out)
++    {
++    tesseract::TessBaseAPI *api;
++
++    api=(tesseract::TessBaseAPI *)vapi;
++    /* Set the mode on every call.  tess_capi_get_ocr() can change it on this same */
++    /* api object, so a function-local "last mode" cache would go stale. */
++    api->SetPageSegMode((tesseract::PageSegMode)segmode);
++    if (!api->ProcessPage(pix,0,NULL,NULL,0,NULL))
++        {
++        if (out!=NULL)
++            fprintf(out,"tesscapi:  Error during bitmap processing.\n");
++        api->Clear();
++        (*nw)=0;
++        return(-1);
++        }
++    /* GetOCRWords() is not defined in this part of the patch. */
++    /* NOTE(review): presumably it allocates the output arrays -- confirm who frees them. */
++    (*nw)=api->GetOCRWords(left,top,right,bottom,ybase,text);
++    api->Clear();
++    return(0);
++    }
++
++
++/* Shut down and delete the TessBaseAPI behind vapi; a NULL handle is ignored. */
++void tess_capi_end(void *vapi)
++    {
++    if (vapi!=NULL)
++        {
++        tesseract::TessBaseAPI *engine=(tesseract::TessBaseAPI *)vapi;
++
++        engine->End();
++        delete engine;
++        }
++    }
+diff --git a/src/api/tesseract.h b/src/api/tesseract.h
+new file mode 100644
+index 00000000..575948cc
+--- /dev/null
++++ b/src/api/tesseract.h
+@@ -0,0 +1,29 @@
++/*
++** Willus.com's Tesseract C Wrappers
++**
++** 6-8-12
++** C-callable interface (extern "C" under C++) around a void* engine handle.
++*/
++
++#ifndef           _TESSERACT_H_
++#define           _TESSERACT_H_
++
++//#include <leptonica.h>  -- disabled: this header includes nothing, so FILE and PIX must already be declared
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out,
++                    char *initstr,int maxlen,int *status); /* engine handle, or NULL on failure */
++int tess_capi_get_ocr(void *api,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out); /* 0 = ok, -1 = error */
++int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode,
++                                int **left,int **top,int **right,int **bottom,
++                                int **ybase,char **text,int *nw,
++                                FILE *out); /* 0 = ok, -1 = error (then *nw is 0) */
++void tess_capi_end(void *api); /* a NULL handle is ignored */
++
++#ifdef __cplusplus
++}
++#endif
++
++#endif /* _TESSERACT_H_ */
+diff --git a/src/ccmain/tessedit.cpp b/src/ccmain/tessedit.cpp
+index 17f0951b..7af94ee2 100644
+--- a/src/ccmain/tessedit.cpp
++++ b/src/ccmain/tessedit.cpp
+@@ -101,6 +101,10 @@ bool Tesseract::init_tesseract_lang_data(
+         " to your \"tessdata\" directory.\n");
+     return false;
+   }
++  /* willus mod */
++  TFile fp;
++  strncpy(fp.tfile_filename,tessdata_path.string(),511);
++  fp.tfile_filename[511]='\0';
+ #ifndef DISABLED_LEGACY_ENGINE
+   if (oem == OEM_DEFAULT) {
+     // Set the engine mode from availability, which can then be overridden by
+@@ -116,7 +120,6 @@ bool Tesseract::init_tesseract_lang_data(
+ #endif  // ndef DISABLED_LEGACY_ENGINE
+ 
+   // If a language specific config file (lang.config) exists, load it in.
+-  TFile fp;
+   if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
+     ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp,
+                                  this->params());
+diff --git a/src/ccutil/ccutil.h b/src/ccutil/ccutil.h
+index 71e89c60..bdeccc14 100644
+--- a/src/ccutil/ccutil.h
++++ b/src/ccutil/ccutil.h
+@@ -80,6 +80,13 @@ class CCUtil {
+   // Member parameters.
+   // These have to be declared and initialized after params_ member, since
+   // params_ should be initialized before parameters are added to it.
++/* willus mod */
++/*
++  #ifdef _WIN32
++  STRING_VAR_H(tessedit_module_name, WINDLLNAME,
++               "Module colocated with tessdata dir");
++  #endif
++*/
+   INT_VAR_H(ambigs_debug_level, 0, "Debug level for unichar ambiguities");
+   BOOL_VAR_H(use_definite_ambigs_for_classifier, false,
+              "Use definite ambiguities when running character classifier");
+diff --git a/src/ccutil/genericvector.h b/src/ccutil/genericvector.h
+index 3556d153..3a5e8662 100644
+--- a/src/ccutil/genericvector.h
++++ b/src/ccutil/genericvector.h
+@@ -382,7 +382,26 @@ inline bool LoadDataFromFile(const char* filename, GenericVector<char>* data) {
+       // reserve an extra byte in case caller wants to append a '\0' character
+       data->reserve(size + 1);
+       data->resize_no_init(size);
+-      result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size;
++    /* willus mod Dec 2018--weird issue with Win XP and MinGW gcc 7.3.0 */
++    /* Can't read entire file at once -- need to break up into smaller blocksize reads */
++    {
++    int frs,n;
++    int blocksize;
++    blocksize=1024*1024;
++    for (n=0;1;)
++        {
++        int bs;
++        bs= size-n > blocksize ? blocksize : size-n;
++        frs=(int)fread(&(*data)[n],1,bs,fp);
++        n+=frs;
++        if (frs<bs || bs<blocksize || n>=size)
++            break;
++        }
++    result = static_cast<long>((long)n==size);
++    }
++    /*
++    result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size;
++    */
+     }
+     fclose(fp);
+   }
+diff --git a/src/ccutil/mainblk.cpp b/src/ccutil/mainblk.cpp
+index 52b04b04..80b26044 100644
+--- a/src/ccutil/mainblk.cpp
++++ b/src/ccutil/mainblk.cpp
+@@ -55,8 +55,22 @@ void CCUtil::main_setup(const char *argv0, const char *basename) {
+ #if defined(_WIN32)
+   } else if (datadir == nullptr || _access(datadir.string(), 0) != 0) {
+     /* Look for tessdata in directory of executable. */
++    /*
++    char drive[_MAX_DRIVE];
++    char dir[_MAX_DIR];
++    */
+     char path[_MAX_PATH];
+-    DWORD length = GetModuleFileName(nullptr, path, sizeof(path));
++    int i;
++    /* DWORD length = */ GetModuleFileName(nullptr, path, sizeof(path));
++    /* willus mod--avoid _splitpath_s -- not in XP */
++    for (i=strlen(path)-1;i>=0 && path[i]!='/' && path[i]!='\\';i--);
++    if (i>=0)
++        {
++        path[i]='\0';
++        datadir=path;
++        datadir += "/tessdata";
++        }
++    /*
+     if (length > 0 && length < sizeof(path)) {
+       char* separator = std::strrchr(path, '\\');
+       if (separator != nullptr) {
+@@ -65,6 +79,7 @@ void CCUtil::main_setup(const char *argv0, const char *basename) {
+         datadir += "/tessdata";
+       }
+     }
++    */
+ #endif /* _WIN32 */
+ #if defined(TESSDATA_PREFIX)
+   } else {
+diff --git a/src/ccutil/params.cpp b/src/ccutil/params.cpp
+index 00bf2563..486c5ce0 100644
+--- a/src/ccutil/params.cpp
++++ b/src/ccutil/params.cpp
+@@ -82,7 +82,8 @@ bool ParamUtils::ReadParamsFromFp(SetParamConstraint constraint, TFile *fp,
+ 
+       if (!foundit) {
+         anyerr = true;         // had an error
+-        tprintf("Warning: Parameter not found: %s\n", line);
++        /* willus mod */
++        tprintf("Tesseract warning: Parameter %s not found in file %s.\n",line,fp->tfile_filename);
+       }
+     }
+   }
+diff --git a/src/ccutil/serialis.cpp b/src/ccutil/serialis.cpp
+index 7def011f..6107a494 100644
+--- a/src/ccutil/serialis.cpp
++++ b/src/ccutil/serialis.cpp
+@@ -201,6 +201,9 @@ bool TFile::Open(const STRING& filename, FileReader reader) {
+   offset_ = 0;
+   is_writing_ = false;
+   swap_ = false;
++  /* willus mod */
++  strncpy(tfile_filename,filename.string(),511);
++  tfile_filename[511]='\0';
+   if (reader == nullptr)
+     return LoadDataFromFile(filename, data_);
+   else
+diff --git a/src/ccutil/serialis.h b/src/ccutil/serialis.h
+index 095b9227..4cc8251e 100644
+--- a/src/ccutil/serialis.h
++++ b/src/ccutil/serialis.h
+@@ -77,6 +77,8 @@ class TFile {
+  public:
+   TFile();
+   ~TFile();
++  /* willus mod */
++  char tfile_filename[512];
+ 
+   // All the Open methods load the whole file into memory for reading.
+   // Opens a file with a supplied reader, or nullptr to use the default.
+diff --git a/src/lstm/input.cpp b/src/lstm/input.cpp
+index 73b584b3..0b0b54c3 100644
+--- a/src/lstm/input.cpp
++++ b/src/lstm/input.cpp
+@@ -93,8 +93,11 @@ Pix* Input::PrepareLSTMInputs(const ImageData& image_data,
+     return nullptr;
+   }
+   if (width < min_width || height < min_width) {
++    /* willus mod -- no warning */
++    /*
+     tprintf("Image too small to scale!! (%dx%d vs min width of %d)\n", width,
+             height, min_width);
++    */
+     pixDestroy(&pix);
+     return nullptr;
+   }
+-- 
+2.22.0
+