Stirling-Tools · thomas-mc-work · Feb 20, 2024 · Mar 5, 2024
@@ -23,9 +23,7 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et
         python3 && \
     wget https://bootstrap.pypa.io/get-pip.py -qO - | python3 - --break-system-packages --no-cache-dir --upgrade && \
 # uno unoconv and HTML
-    pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint && \
-    mv /usr/share/tessdata /usr/share/tessdata-original
-
+    pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint
 
 
 ARG VERSION_TAG
@@ -63,6 +61,9 @@ RUN mkdir -p  /configs /logs /customFiles /pipeline/watchedFolders /pipeline/fin
 
 EXPOSE 8080
 
+# location for the additional tesseract OCR language files
+VOLUME /languages
+
 # Set user and run command
 ##USER stirlingpdfuser
 ENTRYPOINT ["tini", "--", "/scripts/init.sh"]

@@ -3,9 +3,11 @@
 This document provides instructions on how to add additional language packs for the OCR tab in Stirling-PDF, both inside and outside of Docker.
 
 ## My OCR used to work and now doesn't!
-The paths have changed for the tessadata locations on new docker images, please use ``/usr/share/tessdata`` (Others should still work for backwards compatability but might not)
+
+The paths have changed for the tessdata locations on new docker images, please use ``/languages`` (Others should still work for backwards compatibility but might not)
 
 ## How does the OCR Work
+
 Stirling-PDF uses [OCRmyPDF](https://github.com/ocrmypdf/OCRmyPDF) which in turn uses tesseract for its text recognition.
 All credit goes to them for this awesome work!
 
@@ -21,14 +23,13 @@ Depending on your requirements, you can choose the appropriate language pack for
 ### Installing Language Packs
 
 1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need.
-2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tessdata`
-
-# DO NOT REMOVE EXISTING ENG.TRAINEDDATA, IT'S REQUIRED.
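+2. Place the `*.traineddata` files in the `/languages` directory (in the Docker image its contents are symlinked into the Tesseract tessdata directory `/usr/share/tessdata` at startup)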
 
 #### Docker
 
 If you are using Docker, you need to expose the Tesseract tessdata directory as a volume in order to use the additional language packs.
 #### Docker Compose
+
 Modify your `docker-compose.yml` file to include the following volume configuration:
 
 
@@ -37,17 +38,18 @@ services:
   your_service_name:
     image: your_docker_image_name
     volumes:
-      - /location/of/trainingData:/usr/share/tessdata
+      - /location/of/trainingData:/languages:ro
 ```
 
-
 #### Docker run
+
 Add the following to your existing docker run command
 ```bash
--v /location/of/trainingData:/usr/share/tessdata
+-v /location/of/trainingData:/languages:ro
 ```
 
 #### Non-Docker
+
 If you are not using Docker, you need to install the OCR components, including the ocrmypdf app.
 You can see [OCRmyPDF install guide](https://ocrmypdf.readthedocs.io/en/latest/installation.html)
 

@@ -107,10 +107,10 @@ For people that don't mind about space optimization just use the latest tag.
 ![Docker Image Size (tag)](https://img.shields.io/docker/image-size/frooodle/s-pdf/latest-ultra-lite?label=Stirling-PDF%20Ultra-Lite)
 
 Docker Run
-```bash
+```sh
 docker run -d \
   -p 8080:8080 \
-  -v /location/of/trainingData:/usr/share/tessdata \
+  -v /location/of/trainingData:/languages:ro \
   -v /location/of/extraConfigs:/configs \
   -v /location/of/logs:/logs \
   -e DOCKER_ENABLE_SECURITY=false \
@@ -122,16 +122,19 @@ docker run -d \
 
   -v /location/of/customFiles:/customFiles \
 ```
+
 Docker Compose
+
 ```yaml
 version: '3.3'
+
 services:
   stirling-pdf:
     image: frooodle/s-pdf:latest
     ports:
       - '8080:8080'
     volumes:
-      - /location/of/trainingData:/usr/share/tessdata #Required for extra OCR languages
+      - /location/of/trainingData:/languages:ro # Required only for extra OCR languages
       - /location/of/extraConfigs:/configs
 #      - /location/of/customFiles:/customFiles/
 #      - /location/of/logs:/logs/
@@ -142,6 +145,7 @@ services:
 Note: Podman is CLI-compatible with Docker, so simply replace "docker" with "podman".
 
 ## Enable OCR/Compression feature
+
 Please view https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToUseOCR.md
 
 ## Supported Languages

@@ -1,27 +1,15 @@
 #!/bin/bash
 
-# Copy the original tesseract-ocr files to the volume directory without overwriting existing files
-echo "Copying original files without overwriting existing files"
-mkdir -p /usr/share/tessdata
-cp -rn /usr/share/tessdata-original/* /usr/share/tessdata
-
-if [ -d /usr/share/tesseract-ocr/4.00/tessdata ]; then
-        cp -r /usr/share/tesseract-ocr/4.00/tessdata/* /usr/share/tessdata || true;
-fi
-
-if [ -d /usr/share/tesseract-ocr/5/tessdata ]; then
-        cp -r /usr/share/tesseract-ocr/5/tessdata/* /usr/share/tessdata || true;
-fi
+# Add custom language files via soft links to prevent losing required existing files.
+# Each *.traineddata file found in /languages is symlinked into /usr/share/tessdata.
+for lang_file in /languages/*.traineddata; do
+  # When the glob matches nothing bash leaves the literal pattern in place;
+  # skip it so no dangling "*.traineddata" symlink is created.
+  [[ -e "$lang_file" ]] || continue
+  ln -sf "$lang_file" /usr/share/tessdata/
+done
 
 # Check if TESSERACT_LANGS environment variable is set and is not empty
 if [[ -n "$TESSERACT_LANGS" ]]; then
-  # Convert comma-separated values to a space-separated list
-  LANGS=$(echo $TESSERACT_LANGS | tr ',' ' ')
-
-  # Install each language pack
-  for LANG in $LANGS; do
-    apt-get install -y "tesseract-ocr-$LANG"
-  done
+  # Split the comma-separated list (whitespace stripped) into an array of language codes
+  IFS=',' read -r -a PKGS <<< "${TESSERACT_LANGS//[[:space:]]/}"
+  # Prefix every entry with "tesseract-ocr-" and pass each package as its own
+  # argument; a single quoted string would be looked up as one package name.
+  apt-get install -y "${PKGS[@]/#/tesseract-ocr-}"
 fi
 
 /scripts/download-security-jar.sh