From 4dd903b473e914b1f8814220dff50fe1f2baf480 Mon Sep 17 00:00:00 2001 From: Michael Date: Sun, 14 Jan 2024 19:21:08 +0000 Subject: [PATCH 1/2] New addon "tesseract" for OCR --- tesseract/composer.json | 5 + tesseract/composer.lock | 66 +++ tesseract/tesseract.php | 33 ++ tesseract/vendor/autoload.php | 7 + tesseract/vendor/composer/ClassLoader.php | 445 +++++++++++++++ tesseract/vendor/composer/LICENSE | 21 + .../vendor/composer/autoload_classmap.php | 9 + .../vendor/composer/autoload_namespaces.php | 9 + tesseract/vendor/composer/autoload_psr4.php | 10 + tesseract/vendor/composer/autoload_real.php | 55 ++ tesseract/vendor/composer/autoload_static.php | 31 ++ tesseract/vendor/composer/installed.json | 48 ++ .../thiagoalessio/tesseract_ocr/.appveyor.yml | 14 + .../thiagoalessio/tesseract_ocr/MIT-LICENSE | 19 + .../thiagoalessio/tesseract_ocr/README.md | 508 ++++++++++++++++++ .../thiagoalessio/tesseract_ocr/codecov.yml | 4 + .../thiagoalessio/tesseract_ocr/composer.json | 35 ++ .../tesseract_ocr/src/Command.php | 80 +++ .../src/FeatureNotAvailableException.php | 7 + .../tesseract_ocr/src/FriendlyErrors.php | 120 +++++ .../src/ImageNotFoundException.php | 7 + .../src/NoWritePermissionsForOutputFile.php | 7 + .../tesseract_ocr/src/Option.php | 79 +++ .../tesseract_ocr/src/Process.php | 83 +++ .../src/TesseractNotFoundException.php | 7 + .../tesseract_ocr/src/TesseractOCR.php | 181 +++++++ .../src/TesseractOcrException.php | 7 + .../src/UnsuccessfulCommandException.php | 7 + 28 files changed, 1904 insertions(+) create mode 100644 tesseract/composer.json create mode 100644 tesseract/composer.lock create mode 100644 tesseract/tesseract.php create mode 100644 tesseract/vendor/autoload.php create mode 100644 tesseract/vendor/composer/ClassLoader.php create mode 100644 tesseract/vendor/composer/LICENSE create mode 100644 tesseract/vendor/composer/autoload_classmap.php create mode 100644 tesseract/vendor/composer/autoload_namespaces.php create mode 100644 
tesseract/vendor/composer/autoload_psr4.php create mode 100644 tesseract/vendor/composer/autoload_real.php create mode 100644 tesseract/vendor/composer/autoload_static.php create mode 100644 tesseract/vendor/composer/installed.json create mode 100644 tesseract/vendor/thiagoalessio/tesseract_ocr/.appveyor.yml create mode 100644 tesseract/vendor/thiagoalessio/tesseract_ocr/MIT-LICENSE create mode 100644 tesseract/vendor/thiagoalessio/tesseract_ocr/README.md create mode 100644 tesseract/vendor/thiagoalessio/tesseract_ocr/codecov.yml create mode 100644 tesseract/vendor/thiagoalessio/tesseract_ocr/composer.json create mode 100644 tesseract/vendor/thiagoalessio/tesseract_ocr/src/Command.php create mode 100644 tesseract/vendor/thiagoalessio/tesseract_ocr/src/FeatureNotAvailableException.php create mode 100644 tesseract/vendor/thiagoalessio/tesseract_ocr/src/FriendlyErrors.php create mode 100644 tesseract/vendor/thiagoalessio/tesseract_ocr/src/ImageNotFoundException.php create mode 100644 tesseract/vendor/thiagoalessio/tesseract_ocr/src/NoWritePermissionsForOutputFile.php create mode 100644 tesseract/vendor/thiagoalessio/tesseract_ocr/src/Option.php create mode 100644 tesseract/vendor/thiagoalessio/tesseract_ocr/src/Process.php create mode 100644 tesseract/vendor/thiagoalessio/tesseract_ocr/src/TesseractNotFoundException.php create mode 100644 tesseract/vendor/thiagoalessio/tesseract_ocr/src/TesseractOCR.php create mode 100644 tesseract/vendor/thiagoalessio/tesseract_ocr/src/TesseractOcrException.php create mode 100644 tesseract/vendor/thiagoalessio/tesseract_ocr/src/UnsuccessfulCommandException.php diff --git a/tesseract/composer.json b/tesseract/composer.json new file mode 100644 index 00000000..2a0937c0 --- /dev/null +++ b/tesseract/composer.json @@ -0,0 +1,5 @@ +{ + "require": { + "thiagoalessio/tesseract_ocr": "^2.13" + } +} diff --git a/tesseract/composer.lock b/tesseract/composer.lock new file mode 100644 index 00000000..036868e7 --- /dev/null +++ 
b/tesseract/composer.lock @@ -0,0 +1,66 @@ +{ + "_readme": [ + "This file locks the dependencies of your project to a known state", + "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", + "This file is @generated automatically" + ], + "content-hash": "778b5479cb5d2b31b57f40473a87f8eb", + "packages": [ + { + "name": "thiagoalessio/tesseract_ocr", + "version": "2.13.0", + "source": { + "type": "git", + "url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git", + "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1", + "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1", + "shasum": "" + }, + "require": { + "php": "^5.3 || ^7.0 || ^8.0" + }, + "require-dev": { + "phpunit/php-code-coverage": "^2.2.4 || ^9.0.0" + }, + "type": "library", + "autoload": { + "psr-4": { + "thiagoalessio\\TesseractOCR\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "thiagoalessio", + "email": "thiagoalessio@me.com" + } + ], + "description": "A wrapper to work with Tesseract OCR inside PHP.", + "keywords": [ + "OCR", + "Tesseract", + "text recognition" + ], + "time": "2023-10-05T21:14:48+00:00" + } + ], + "packages-dev": [], + "aliases": [], + "minimum-stability": "stable", + "stability-flags": [], + "prefer-stable": false, + "prefer-lowest": false, + "platform": [], + "platform-dev": [], + "platform-overrides": { + "php": "7.2" + }, + "plugin-api-version": "1.1.0" +} diff --git a/tesseract/tesseract.php b/tesseract/tesseract.php new file mode 100644 index 00000000..3c61273f --- /dev/null +++ b/tesseract/tesseract.php @@ -0,0 +1,33 @@ + + */ + +use Friendica\Core\Hook; +use Friendica\Core\Logger; +use Friendica\Core\System; +use thiagoalessio\TesseractOCR\TesseractOCR; + +require_once __DIR__ . 
DIRECTORY_SEPARATOR . 'vendor' . DIRECTORY_SEPARATOR . 'autoload.php'; + +function tesseract_install() +{ + Hook::register('ocr-detection', __FILE__, 'tesseract_ocr_detection'); + + Logger::notice('installed tesseract'); +} + +function tesseract_ocr_detection(&$media) +{ + $ocr = new TesseractOCR(); + try { + $ocr->tempDir(System::getTempPath()); + $ocr->imageData($media['img_str'], strlen($media['img_str'])); + $media['description'] = $ocr->run(); + } catch (\Throwable $th) { + Logger::info('Error calling TesseractOCR', ['message' => $th->getMessage()]); + } +} diff --git a/tesseract/vendor/autoload.php b/tesseract/vendor/autoload.php new file mode 100644 index 00000000..1238ecea --- /dev/null +++ b/tesseract/vendor/autoload.php @@ -0,0 +1,7 @@ + + * Jordi Boggiano + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Composer\Autoload; + +/** + * ClassLoader implements a PSR-0, PSR-4 and classmap class loader. + * + * $loader = new \Composer\Autoload\ClassLoader(); + * + * // register classes with namespaces + * $loader->add('Symfony\Component', __DIR__.'/component'); + * $loader->add('Symfony', __DIR__.'/framework'); + * + * // activate the autoloader + * $loader->register(); + * + * // to enable searching the include path (eg. for PEAR packages) + * $loader->setUseIncludePath(true); + * + * In this example, if you try to use a class in the Symfony\Component + * namespace or one of its children (Symfony\Component\Console for instance), + * the autoloader will first look for the class under the component/ + * directory, and it will then fallback to the framework/ directory if not + * found before giving up. + * + * This class is loosely based on the Symfony UniversalClassLoader. 
+ * + * @author Fabien Potencier + * @author Jordi Boggiano + * @see http://www.php-fig.org/psr/psr-0/ + * @see http://www.php-fig.org/psr/psr-4/ + */ +class ClassLoader +{ + // PSR-4 + private $prefixLengthsPsr4 = array(); + private $prefixDirsPsr4 = array(); + private $fallbackDirsPsr4 = array(); + + // PSR-0 + private $prefixesPsr0 = array(); + private $fallbackDirsPsr0 = array(); + + private $useIncludePath = false; + private $classMap = array(); + private $classMapAuthoritative = false; + private $missingClasses = array(); + private $apcuPrefix; + + public function getPrefixes() + { + if (!empty($this->prefixesPsr0)) { + return call_user_func_array('array_merge', array_values($this->prefixesPsr0)); + } + + return array(); + } + + public function getPrefixesPsr4() + { + return $this->prefixDirsPsr4; + } + + public function getFallbackDirs() + { + return $this->fallbackDirsPsr0; + } + + public function getFallbackDirsPsr4() + { + return $this->fallbackDirsPsr4; + } + + public function getClassMap() + { + return $this->classMap; + } + + /** + * @param array $classMap Class to filename map + */ + public function addClassMap(array $classMap) + { + if ($this->classMap) { + $this->classMap = array_merge($this->classMap, $classMap); + } else { + $this->classMap = $classMap; + } + } + + /** + * Registers a set of PSR-0 directories for a given prefix, either + * appending or prepending to the ones previously set for this prefix. 
+ * + * @param string $prefix The prefix + * @param array|string $paths The PSR-0 root directories + * @param bool $prepend Whether to prepend the directories + */ + public function add($prefix, $paths, $prepend = false) + { + if (!$prefix) { + if ($prepend) { + $this->fallbackDirsPsr0 = array_merge( + (array) $paths, + $this->fallbackDirsPsr0 + ); + } else { + $this->fallbackDirsPsr0 = array_merge( + $this->fallbackDirsPsr0, + (array) $paths + ); + } + + return; + } + + $first = $prefix[0]; + if (!isset($this->prefixesPsr0[$first][$prefix])) { + $this->prefixesPsr0[$first][$prefix] = (array) $paths; + + return; + } + if ($prepend) { + $this->prefixesPsr0[$first][$prefix] = array_merge( + (array) $paths, + $this->prefixesPsr0[$first][$prefix] + ); + } else { + $this->prefixesPsr0[$first][$prefix] = array_merge( + $this->prefixesPsr0[$first][$prefix], + (array) $paths + ); + } + } + + /** + * Registers a set of PSR-4 directories for a given namespace, either + * appending or prepending to the ones previously set for this namespace. + * + * @param string $prefix The prefix/namespace, with trailing '\\' + * @param array|string $paths The PSR-4 base directories + * @param bool $prepend Whether to prepend the directories + * + * @throws \InvalidArgumentException + */ + public function addPsr4($prefix, $paths, $prepend = false) + { + if (!$prefix) { + // Register directories for the root namespace. + if ($prepend) { + $this->fallbackDirsPsr4 = array_merge( + (array) $paths, + $this->fallbackDirsPsr4 + ); + } else { + $this->fallbackDirsPsr4 = array_merge( + $this->fallbackDirsPsr4, + (array) $paths + ); + } + } elseif (!isset($this->prefixDirsPsr4[$prefix])) { + // Register directories for a new namespace. 
+ $length = strlen($prefix); + if ('\\' !== $prefix[$length - 1]) { + throw new \InvalidArgumentException("A non-empty PSR-4 prefix must end with a namespace separator."); + } + $this->prefixLengthsPsr4[$prefix[0]][$prefix] = $length; + $this->prefixDirsPsr4[$prefix] = (array) $paths; + } elseif ($prepend) { + // Prepend directories for an already registered namespace. + $this->prefixDirsPsr4[$prefix] = array_merge( + (array) $paths, + $this->prefixDirsPsr4[$prefix] + ); + } else { + // Append directories for an already registered namespace. + $this->prefixDirsPsr4[$prefix] = array_merge( + $this->prefixDirsPsr4[$prefix], + (array) $paths + ); + } + } + + /** + * Registers a set of PSR-0 directories for a given prefix, + * replacing any others previously set for this prefix. + * + * @param string $prefix The prefix + * @param array|string $paths The PSR-0 base directories + */ + public function set($prefix, $paths) + { + if (!$prefix) { + $this->fallbackDirsPsr0 = (array) $paths; + } else { + $this->prefixesPsr0[$prefix[0]][$prefix] = (array) $paths; + } + } + + /** + * Registers a set of PSR-4 directories for a given namespace, + * replacing any others previously set for this namespace. + * + * @param string $prefix The prefix/namespace, with trailing '\\' + * @param array|string $paths The PSR-4 base directories + * + * @throws \InvalidArgumentException + */ + public function setPsr4($prefix, $paths) + { + if (!$prefix) { + $this->fallbackDirsPsr4 = (array) $paths; + } else { + $length = strlen($prefix); + if ('\\' !== $prefix[$length - 1]) { + throw new \InvalidArgumentException("A non-empty PSR-4 prefix must end with a namespace separator."); + } + $this->prefixLengthsPsr4[$prefix[0]][$prefix] = $length; + $this->prefixDirsPsr4[$prefix] = (array) $paths; + } + } + + /** + * Turns on searching the include path for class files. 
+ * + * @param bool $useIncludePath + */ + public function setUseIncludePath($useIncludePath) + { + $this->useIncludePath = $useIncludePath; + } + + /** + * Can be used to check if the autoloader uses the include path to check + * for classes. + * + * @return bool + */ + public function getUseIncludePath() + { + return $this->useIncludePath; + } + + /** + * Turns off searching the prefix and fallback directories for classes + * that have not been registered with the class map. + * + * @param bool $classMapAuthoritative + */ + public function setClassMapAuthoritative($classMapAuthoritative) + { + $this->classMapAuthoritative = $classMapAuthoritative; + } + + /** + * Should class lookup fail if not found in the current class map? + * + * @return bool + */ + public function isClassMapAuthoritative() + { + return $this->classMapAuthoritative; + } + + /** + * APCu prefix to use to cache found/not-found classes, if the extension is enabled. + * + * @param string|null $apcuPrefix + */ + public function setApcuPrefix($apcuPrefix) + { + $this->apcuPrefix = function_exists('apcu_fetch') && filter_var(ini_get('apc.enabled'), FILTER_VALIDATE_BOOLEAN) ? $apcuPrefix : null; + } + + /** + * The APCu prefix in use, or null if APCu caching is not enabled. + * + * @return string|null + */ + public function getApcuPrefix() + { + return $this->apcuPrefix; + } + + /** + * Registers this instance as an autoloader. + * + * @param bool $prepend Whether to prepend the autoloader or not + */ + public function register($prepend = false) + { + spl_autoload_register(array($this, 'loadClass'), true, $prepend); + } + + /** + * Unregisters this instance as an autoloader. + */ + public function unregister() + { + spl_autoload_unregister(array($this, 'loadClass')); + } + + /** + * Loads the given class or interface. 
+ * + * @param string $class The name of the class + * @return bool|null True if loaded, null otherwise + */ + public function loadClass($class) + { + if ($file = $this->findFile($class)) { + includeFile($file); + + return true; + } + } + + /** + * Finds the path to the file where the class is defined. + * + * @param string $class The name of the class + * + * @return string|false The path if found, false otherwise + */ + public function findFile($class) + { + // class map lookup + if (isset($this->classMap[$class])) { + return $this->classMap[$class]; + } + if ($this->classMapAuthoritative || isset($this->missingClasses[$class])) { + return false; + } + if (null !== $this->apcuPrefix) { + $file = apcu_fetch($this->apcuPrefix.$class, $hit); + if ($hit) { + return $file; + } + } + + $file = $this->findFileWithExtension($class, '.php'); + + // Search for Hack files if we are running on HHVM + if (false === $file && defined('HHVM_VERSION')) { + $file = $this->findFileWithExtension($class, '.hh'); + } + + if (null !== $this->apcuPrefix) { + apcu_add($this->apcuPrefix.$class, $file); + } + + if (false === $file) { + // Remember that this class does not exist. + $this->missingClasses[$class] = true; + } + + return $file; + } + + private function findFileWithExtension($class, $ext) + { + // PSR-4 lookup + $logicalPathPsr4 = strtr($class, '\\', DIRECTORY_SEPARATOR) . $ext; + + $first = $class[0]; + if (isset($this->prefixLengthsPsr4[$first])) { + $subPath = $class; + while (false !== $lastPos = strrpos($subPath, '\\')) { + $subPath = substr($subPath, 0, $lastPos); + $search = $subPath . '\\'; + if (isset($this->prefixDirsPsr4[$search])) { + $pathEnd = DIRECTORY_SEPARATOR . substr($logicalPathPsr4, $lastPos + 1); + foreach ($this->prefixDirsPsr4[$search] as $dir) { + if (file_exists($file = $dir . $pathEnd)) { + return $file; + } + } + } + } + } + + // PSR-4 fallback dirs + foreach ($this->fallbackDirsPsr4 as $dir) { + if (file_exists($file = $dir . DIRECTORY_SEPARATOR . 
$logicalPathPsr4)) { + return $file; + } + } + + // PSR-0 lookup + if (false !== $pos = strrpos($class, '\\')) { + // namespaced class name + $logicalPathPsr0 = substr($logicalPathPsr4, 0, $pos + 1) + . strtr(substr($logicalPathPsr4, $pos + 1), '_', DIRECTORY_SEPARATOR); + } else { + // PEAR-like class name + $logicalPathPsr0 = strtr($class, '_', DIRECTORY_SEPARATOR) . $ext; + } + + if (isset($this->prefixesPsr0[$first])) { + foreach ($this->prefixesPsr0[$first] as $prefix => $dirs) { + if (0 === strpos($class, $prefix)) { + foreach ($dirs as $dir) { + if (file_exists($file = $dir . DIRECTORY_SEPARATOR . $logicalPathPsr0)) { + return $file; + } + } + } + } + } + + // PSR-0 fallback dirs + foreach ($this->fallbackDirsPsr0 as $dir) { + if (file_exists($file = $dir . DIRECTORY_SEPARATOR . $logicalPathPsr0)) { + return $file; + } + } + + // PSR-0 include paths. + if ($this->useIncludePath && $file = stream_resolve_include_path($logicalPathPsr0)) { + return $file; + } + + return false; + } +} + +/** + * Scope isolated include. + * + * Prevents access to $this/self from included files. + */ +function includeFile($file) +{ + include $file; +} diff --git a/tesseract/vendor/composer/LICENSE b/tesseract/vendor/composer/LICENSE new file mode 100644 index 00000000..f27399a0 --- /dev/null +++ b/tesseract/vendor/composer/LICENSE @@ -0,0 +1,21 @@ + +Copyright (c) Nils Adermann, Jordi Boggiano + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is furnished +to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + diff --git a/tesseract/vendor/composer/autoload_classmap.php b/tesseract/vendor/composer/autoload_classmap.php new file mode 100644 index 00000000..7a91153b --- /dev/null +++ b/tesseract/vendor/composer/autoload_classmap.php @@ -0,0 +1,9 @@ + array($vendorDir . '/thiagoalessio/tesseract_ocr/src'), +); diff --git a/tesseract/vendor/composer/autoload_real.php b/tesseract/vendor/composer/autoload_real.php new file mode 100644 index 00000000..10af9c56 --- /dev/null +++ b/tesseract/vendor/composer/autoload_real.php @@ -0,0 +1,55 @@ += 50600 && !defined('HHVM_VERSION') && (!function_exists('zend_loader_file_encoded') || !zend_loader_file_encoded()); + if ($useStaticLoader) { + require_once __DIR__ . '/autoload_static.php'; + + call_user_func(\Composer\Autoload\ComposerStaticInit695d781792f754383aa61632167d066e::getInitializer($loader)); + } else { + $map = require __DIR__ . '/autoload_namespaces.php'; + foreach ($map as $namespace => $path) { + $loader->set($namespace, $path); + } + + $map = require __DIR__ . '/autoload_psr4.php'; + foreach ($map as $namespace => $path) { + $loader->setPsr4($namespace, $path); + } + + $classMap = require __DIR__ . 
'/autoload_classmap.php'; + if ($classMap) { + $loader->addClassMap($classMap); + } + } + + $loader->register(true); + + return $loader; + } +} diff --git a/tesseract/vendor/composer/autoload_static.php b/tesseract/vendor/composer/autoload_static.php new file mode 100644 index 00000000..59b66053 --- /dev/null +++ b/tesseract/vendor/composer/autoload_static.php @@ -0,0 +1,31 @@ + + array ( + 'thiagoalessio\\TesseractOCR\\' => 27, + ), + ); + + public static $prefixDirsPsr4 = array ( + 'thiagoalessio\\TesseractOCR\\' => + array ( + 0 => __DIR__ . '/..' . '/thiagoalessio/tesseract_ocr/src', + ), + ); + + public static function getInitializer(ClassLoader $loader) + { + return \Closure::bind(function () use ($loader) { + $loader->prefixLengthsPsr4 = ComposerStaticInit695d781792f754383aa61632167d066e::$prefixLengthsPsr4; + $loader->prefixDirsPsr4 = ComposerStaticInit695d781792f754383aa61632167d066e::$prefixDirsPsr4; + + }, null, ClassLoader::class); + } +} diff --git a/tesseract/vendor/composer/installed.json b/tesseract/vendor/composer/installed.json new file mode 100644 index 00000000..70bcc01c --- /dev/null +++ b/tesseract/vendor/composer/installed.json @@ -0,0 +1,48 @@ +[ + { + "name": "thiagoalessio/tesseract_ocr", + "version": "2.13.0", + "version_normalized": "2.13.0.0", + "source": { + "type": "git", + "url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git", + "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1", + "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1", + "shasum": "" + }, + "require": { + "php": "^5.3 || ^7.0 || ^8.0" + }, + "require-dev": { + "phpunit/php-code-coverage": "^2.2.4 || ^9.0.0" + }, + "time": "2023-10-05T21:14:48+00:00", + "type": "library", + "installation-source": "dist", + "autoload": { + "psr-4": { + "thiagoalessio\\TesseractOCR\\": "src/" + } + }, + 
"notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "thiagoalessio", + "email": "thiagoalessio@me.com" + } + ], + "description": "A wrapper to work with Tesseract OCR inside PHP.", + "keywords": [ + "OCR", + "Tesseract", + "text recognition" + ] + } +] diff --git a/tesseract/vendor/thiagoalessio/tesseract_ocr/.appveyor.yml b/tesseract/vendor/thiagoalessio/tesseract_ocr/.appveyor.yml new file mode 100644 index 00000000..1debc1a1 --- /dev/null +++ b/tesseract/vendor/thiagoalessio/tesseract_ocr/.appveyor.yml @@ -0,0 +1,14 @@ +--- +build: false + +install: + - ps: Set-Service wuauserv -StartupType Manual + - choco install php + - choco install capture2text --version 3.9 + - choco install composer + - refreshenv + - cd %APPVEYOR_BUILD_FOLDER% + - composer install + +test_script: + - php tests\run.php unit e2e diff --git a/tesseract/vendor/thiagoalessio/tesseract_ocr/MIT-LICENSE b/tesseract/vendor/thiagoalessio/tesseract_ocr/MIT-LICENSE new file mode 100644 index 00000000..448104d6 --- /dev/null +++ b/tesseract/vendor/thiagoalessio/tesseract_ocr/MIT-LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2012-2021 Thiago Alessio Pereira + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/tesseract/vendor/thiagoalessio/tesseract_ocr/README.md b/tesseract/vendor/thiagoalessio/tesseract_ocr/README.md new file mode 100644 index 00000000..b828344c --- /dev/null +++ b/tesseract/vendor/thiagoalessio/tesseract_ocr/README.md @@ -0,0 +1,508 @@ +Tesseract OCR for PHP + +# Tesseract OCR for PHP + +A wrapper to work with Tesseract OCR inside PHP. + +[![CI][ci_badge]][ci] +[![AppVeyor][appveyor_badge]][appveyor] +[![Codacy][codacy_badge]][codacy] +[![Test Coverage][test_coverage_badge]][test_coverage] +
+[![Latest Stable Version][stable_version_badge]][packagist] +[![Total Downloads][total_downloads_badge]][packagist] +[![Monthly Downloads][monthly_downloads_badge]][packagist] + +## Installation + +Via [Composer][]: + + $ composer require thiagoalessio/tesseract_ocr + +:bangbang: **This library depends on [Tesseract OCR][], version _3.02_ or later.** + +
+ +### ![][windows_icon] Note for Windows users + +There are [many ways][tesseract_installation_on_windows] to install +[Tesseract OCR][] on your system, but if you just want something quick to +get up and running, I recommend installing the [Capture2Text][] package with +[Chocolatey][]. + + choco install capture2text --version 3.9 + +:warning: Recent versions of [Capture2Text][] stopped shipping the `tesseract` binary. + +
+ +### ![][macos_icon] Note for macOS users + +With [MacPorts][] you can install support for individual languages, like so: + + $ sudo port install tesseract-<langcode> + +But that is not possible with [Homebrew][]. It comes only with **English** support +by default, so if you intend to use it for other languages, the quickest solution +is to install them all: + + $ brew install tesseract tesseract-lang + +
+ +## Usage + +### Basic usage + + + +```php +use thiagoalessio\TesseractOCR\TesseractOCR; +echo (new TesseractOCR('text.png')) + ->run(); +``` + +``` +The quick brown fox +jumps over +the lazy dog. +``` + +
+ +### Other languages + + + +```php +use thiagoalessio\TesseractOCR\TesseractOCR; +echo (new TesseractOCR('german.png')) + ->lang('deu') + ->run(); +``` + +``` +Bülowstraße +``` + +
+ +### Multiple languages + + + +```php +use thiagoalessio\TesseractOCR\TesseractOCR; +echo (new TesseractOCR('mixed-languages.png')) + ->lang('eng', 'jpn', 'spa') + ->run(); +``` + +``` +I eat すし y Pollo +``` + +
+ +### Inducing recognition + + + +```php +use thiagoalessio\TesseractOCR\TesseractOCR; +echo (new TesseractOCR('8055.png')) + ->allowlist(range('A', 'Z')) + ->run(); +``` + +``` +BOSS +``` + +
+ +### Breaking CAPTCHAs + +Yes, I know some of you might want to use this library for the *noble* purpose +of breaking CAPTCHAs, so please take a look at this comment: + + + +## API + +### run + +Executes a `tesseract` command, optionally receiving an integer as `timeout`, +in case you experience stalled tesseract processes. + +```php +$ocr = new TesseractOCR(); +$ocr->run(); +``` +```php +$ocr = new TesseractOCR(); +$timeout = 500; +$ocr->run($timeout); +``` + +### image + +Define the path of an image to be recognized by `tesseract`. + +```php +$ocr = new TesseractOCR(); +$ocr->image('/path/to/image.png'); +$ocr->run(); +``` + +### imageData + +Set the image to be recognized by `tesseract` from a string, with its size. +This can be useful when dealing with files that are already loaded in memory. +You can easily retrieve the image data and size of an image object : +```php +//Using Imagick +$data = $img->getImageBlob(); +$size = $img->getImageLength(); +//Using GD +ob_start(); +// Note that you can use any format supported by tesseract +imagepng($img, null, 0); +$size = ob_get_length(); +$data = ob_get_clean(); + +$ocr = new TesseractOCR(); +$ocr->imageData($data, $size); +$ocr->run(); +``` + +### executable + +Define a custom location of the `tesseract` executable, +if by any reason it is not present in the `$PATH`. + +```php +echo (new TesseractOCR('img.png')) + ->executable('/path/to/tesseract') + ->run(); +``` + +### version + +Returns the current version of `tesseract`. + +```php +echo (new TesseractOCR())->version(); +``` + +### availableLanguages + +Returns a list of available languages/scripts. + +```php +foreach((new TesseractOCR())->availableLanguages() as $lang) echo $lang; +``` + +__More info:__ + +### tessdataDir + +Specify a custom location for the tessdata directory. + +```php +echo (new TesseractOCR('img.png')) + ->tessdataDir('/path') + ->run(); +``` + +### userWords + +Specify the location of user words file. 
+ +This is a plain text file containing a list of words that you want to be +treated as normal dictionary words by `tesseract`. + +Useful when dealing with contents that contain technical terminology, jargon, +etc. + +``` +$ cat /path/to/user-words.txt +foo +bar +``` + +```php +echo (new TesseractOCR('img.png')) + ->userWords('/path/to/user-words.txt') + ->run(); +``` + +### userPatterns + +Specify the location of user patterns file. + +If the contents you are dealing with have known patterns, this option can greatly +improve tesseract's recognition accuracy. + +``` +$ cat /path/to/user-patterns.txt +1-\d\d\d-GOOG-441 +www.\n\\\*.com +``` + +```php +echo (new TesseractOCR('img.png')) + ->userPatterns('/path/to/user-patterns.txt') + ->run(); +``` + +### lang + +Define one or more languages to be used during the recognition. +A complete list of available languages can be found at: + + +__Tip from [@daijiale][]:__ Use the combination `->lang('chi_sim', 'chi_tra')` +for proper recognition of Chinese. + +```php + echo (new TesseractOCR('img.png')) + ->lang('lang1', 'lang2', 'lang3') + ->run(); +``` + +### psm + +Specify the Page Segmentation Method, which instructs `tesseract` how to +interpret the given image. + +__More info:__ + +```php +echo (new TesseractOCR('img.png')) + ->psm(6) + ->run(); +``` + +### oem + +Specify the OCR Engine Mode. (see `tesseract --help-oem`) + +```php +echo (new TesseractOCR('img.png')) + ->oem(2) + ->run(); +``` + +### dpi + +Specify the image DPI. It is useful if your image does not contain this information in its metadata. + +```php +echo (new TesseractOCR('img.png')) + ->dpi(300) + ->run(); +``` + +### allowlist + +This is a shortcut for `->config('tessedit_char_whitelist', 'abcdef....')`. + +```php +echo (new TesseractOCR('img.png')) + ->allowlist(range('a', 'z'), range(0, 9), '-_@') + ->run(); +``` + +### configFile + +Specify a config file to be used. 
It can either be the path to your own +config file or the name of one of the predefined config files: + + +```php +echo (new TesseractOCR('img.png')) + ->configFile('hocr') + ->run(); +``` + +### setOutputFile + +Specify an Outputfile to be used. Be aware: If you set an outputfile then +the option `withoutTempFiles` is ignored. +Tempfiles are written (and deleted) even if `withoutTempFiles = true`. + +In combination with `configFile` you are able to get the `hocr`, `tsv` or +`pdf` files. + +```php +echo (new TesseractOCR('img.png')) + ->configFile('pdf') + ->setOutputFile('/PATH_TO_MY_OUTPUTFILE/searchable.pdf') + ->run(); +``` + +### digits + +Shortcut for `->configFile('digits')`. + +```php +echo (new TesseractOCR('img.png')) + ->digits() + ->run(); +``` + +### hocr + +Shortcut for `->configFile('hocr')`. + +```php +echo (new TesseractOCR('img.png')) + ->hocr() + ->run(); +``` + +### pdf + +Shortcut for `->configFile('pdf')`. + +```php +echo (new TesseractOCR('img.png')) + ->pdf() + ->run(); +``` + +### quiet + +Shortcut for `->configFile('quiet')`. + +```php +echo (new TesseractOCR('img.png')) + ->quiet() + ->run(); +``` + +### tsv + +Shortcut for `->configFile('tsv')`. + +```php +echo (new TesseractOCR('img.png')) + ->tsv() + ->run(); +``` + +### txt + +Shortcut for `->configFile('txt')`. + +```php +echo (new TesseractOCR('img.png')) + ->txt() + ->run(); +``` + +### tempDir + +Define a custom directory to store temporary files generated by tesseract. +Make sure the directory actually exists and the user running `php` is allowed +to write in there. + +```php +echo (new TesseractOCR('img.png')) + ->tempDir('./my/custom/temp/dir') + ->run(); +``` + +### withoutTempFiles + +Specify that `tesseract` should output the recognized text without writing to temporary files. +The data is gathered from the standard output of `tesseract` instead. 
+ +```php +echo (new TesseractOCR('img.png')) + ->withoutTempFiles() + ->run(); +``` + +### Other options + +Any configuration option offered by Tesseract can be used like that: + +```php +echo (new TesseractOCR('img.png')) + ->config('config_var', 'value') + ->config('other_config_var', 'other value') + ->run(); +``` + +Or like that: + +```php +echo (new TesseractOCR('img.png')) + ->configVar('value') + ->otherConfigVar('other value') + ->run(); +``` + +__More info:__ + +### Thread-limit + +Sometimes, it may be useful to limit the number of threads that tesseract is +allowed to use (e.g. in [this case](https://github.com/tesseract-ocr/tesseract/issues/898)). +Set the maximum number of threads with the `threadLimit` method before calling `run`: + +```php +echo (new TesseractOCR('img.png')) + ->threadLimit(1) + ->run(); +``` + +## How to contribute + +You can contribute to this project by: + +* Opening an [Issue][] if you found a bug or wish to propose a new feature; +* Placing a [Pull Request][] with code that fixes a bug, missing/wrong documentation + or implements a new feature; + +Just make sure you take a look at our [Code of Conduct][] and [Contributing][] +instructions. + +## License + +tesseract-ocr-for-php is released under the [MIT License][]. + + +

Made with love in Berlin

+ +[ci_badge]: https://github.com/thiagoalessio/tesseract-ocr-for-php/workflows/CI/badge.svg?event=push&branch=main +[ci]: https://github.com/thiagoalessio/tesseract-ocr-for-php/actions?query=workflow%3ACI +[appveyor_badge]: https://ci.appveyor.com/api/projects/status/xwy5ls0798iwcim3/branch/main?svg=true +[appveyor]: https://ci.appveyor.com/project/thiagoalessio/tesseract-ocr-for-php/branch/main +[codacy_badge]: https://app.codacy.com/project/badge/Grade/a81aa10012874f23a57df5b492d835f2 +[codacy]: https://www.codacy.com/gh/thiagoalessio/tesseract-ocr-for-php/dashboard +[test_coverage_badge]: https://codecov.io/gh/thiagoalessio/tesseract-ocr-for-php/branch/main/graph/badge.svg?token=Y0VnrqiSIf +[test_coverage]: https://codecov.io/gh/thiagoalessio/tesseract-ocr-for-php +[stable_version_badge]: https://img.shields.io/packagist/v/thiagoalessio/tesseract_ocr.svg +[packagist]: https://packagist.org/packages/thiagoalessio/tesseract_ocr +[total_downloads_badge]: https://img.shields.io/packagist/dt/thiagoalessio/tesseract_ocr.svg +[monthly_downloads_badge]: https://img.shields.io/packagist/dm/thiagoalessio/tesseract_ocr.svg +[Tesseract OCR]: https://github.com/tesseract-ocr/tesseract +[Composer]: http://getcomposer.org/ +[windows_icon]: https://thiagoalessio.github.io/tesseract-ocr-for-php/images/windows-18.svg +[macos_icon]: https://thiagoalessio.github.io/tesseract-ocr-for-php/images/apple-18.svg +[tesseract_installation_on_windows]: https://github.com/tesseract-ocr/tesseract/wiki#windows +[Capture2Text]: https://chocolatey.org/packages/capture2text +[Chocolatey]: https://chocolatey.org +[MacPorts]: https://www.macports.org +[Homebrew]: https://brew.sh +[@daijiale]: https://github.com/daijiale +[HOCR]: https://github.com/tesseract-ocr/tesseract/wiki/Command-Line-Usage#hocr-output +[TSV]: https://github.com/tesseract-ocr/tesseract/wiki/Command-Line-Usage#tsv-output-currently-available-in-305-dev-in-master-branch-on-github +[Issue]: 
https://github.com/thiagoalessio/tesseract-ocr-for-php/issues +[Pull Request]: https://github.com/thiagoalessio/tesseract-ocr-for-php/pulls +[Code of Conduct]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/.github/CODE_OF_CONDUCT.md +[Contributing]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/.github/CONTRIBUTING.md +[MIT License]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/MIT-LICENSE diff --git a/tesseract/vendor/thiagoalessio/tesseract_ocr/codecov.yml b/tesseract/vendor/thiagoalessio/tesseract_ocr/codecov.yml new file mode 100644 index 00000000..8fd4c921 --- /dev/null +++ b/tesseract/vendor/thiagoalessio/tesseract_ocr/codecov.yml @@ -0,0 +1,4 @@ +fixes: +- "/home/runner/work/tesseract-ocr-for-php/tesseract-ocr-for-php/::" +- "/Users/runner/work/tesseract-ocr-for-php/tesseract-ocr-for-php/::" +- "C:\\projects\\tesseract-ocr-for-php\\::" diff --git a/tesseract/vendor/thiagoalessio/tesseract_ocr/composer.json b/tesseract/vendor/thiagoalessio/tesseract_ocr/composer.json new file mode 100644 index 00000000..9a07e6d5 --- /dev/null +++ b/tesseract/vendor/thiagoalessio/tesseract_ocr/composer.json @@ -0,0 +1,35 @@ +{ + "name": "thiagoalessio/tesseract_ocr", + "description": "A wrapper to work with Tesseract OCR inside PHP.", + "version": "2.13.0", + "type": "library", + "keywords": ["Tesseract", "OCR", "text recognition"], + "license": "MIT", + "authors": [ + { + "name": "thiagoalessio", + "email": "thiagoalessio@me.com" + } + ], + "support": { + "issues": "https://github.com/thiagoalessio/tesseract-ocr-for-php/issues", + "irc": "irc://irc.freenode.net/tesseract-ocr-for-php", + "source": "https://github.com/thiagoalessio/tesseract-ocr-for-php" + }, + "require": { + "php": "^5.3 || ^7.0 || ^8.0" + }, + "require-dev": { + "phpunit/php-code-coverage": "^2.2.4 || ^9.0.0" + }, + "autoload": { + "psr-4": { + "thiagoalessio\\TesseractOCR\\": "src/" + } + }, + "autoload-dev": { + "psr-4": { + 
"thiagoalessio\\TesseractOCR\\Tests\\": "tests/" + } + } +} diff --git a/tesseract/vendor/thiagoalessio/tesseract_ocr/src/Command.php b/tesseract/vendor/thiagoalessio/tesseract_ocr/src/Command.php new file mode 100644 index 00000000..ad123e82 --- /dev/null +++ b/tesseract/vendor/thiagoalessio/tesseract_ocr/src/Command.php @@ -0,0 +1,80 @@ +image = $image; + $this->outputFile = $outputFile; + } + + public function build() { return "$this"; } + + public function __toString() + { + $cmd = array(); + if ($this->threadLimit) $cmd[] = "OMP_THREAD_LIMIT={$this->threadLimit}"; + $cmd[] = self::escape($this->executable); + $cmd[] = $this->useFileAsInput ? self::escape($this->image) : "-"; + $cmd[] = $this->useFileAsOutput ? self::escape($this->getOutputFile(false)) : "-"; + + $version = $this->getTesseractVersion(); + + foreach ($this->options as $option) { + $cmd[] = is_callable($option) ? $option($version) : "$option"; + } + if ($this->configFile) $cmd[] = $this->configFile; + + return join(' ', $cmd); + } + + public function getOutputFile($withExt=true) + { + if (!$this->outputFile) + $this->outputFile = $this->getTempDir() + .DIRECTORY_SEPARATOR + .basename(tempnam($this->getTempDir(), 'ocr')); + if (!$withExt) return $this->outputFile; + + $hasCustomExt = array('hocr', 'tsv', 'pdf'); + $ext = in_array($this->configFile, $hasCustomExt) ? $this->configFile : 'txt'; + return "{$this->outputFile}.{$ext}"; + } + + public function getTempDir() + { + return $this->tempDir ?: sys_get_temp_dir(); + } + + public function getTesseractVersion() + { + exec(self::escape($this->executable).' --version 2>&1', $output); + $outputParts = explode(' ', $output[0]); + return $outputParts[1]; + } + + public function getAvailableLanguages() + { + exec(self::escape($this->executable) . ' --list-langs 2>&1', $output); + array_shift($output); + sort($output); + return $output; + } + + public static function escape($str) + { + $charlist = strtoupper(substr(PHP_OS, 0, 3)) == 'WIN' ? 
'$"`' : '$"\\`'; + return '"'.addcslashes($str, $charlist).'"'; + } +} diff --git a/tesseract/vendor/thiagoalessio/tesseract_ocr/src/FeatureNotAvailableException.php b/tesseract/vendor/thiagoalessio/tesseract_ocr/src/FeatureNotAvailableException.php new file mode 100644 index 00000000..12264f5c --- /dev/null +++ b/tesseract/vendor/thiagoalessio/tesseract_ocr/src/FeatureNotAvailableException.php @@ -0,0 +1,7 @@ + NUL 2>&1' + : 'type '.Command::escape($executable).' > /dev/null 2>&1'; + system($cmd, $exitCode); + + if ($exitCode == 0) return; + + $currentPath = getenv('PATH'); + $msg = array(); + $msg[] = "Error! The command \"$executable\" was not found."; + $msg[] = ''; + $msg[] = 'Make sure you have Tesseract OCR installed on your system:'; + $msg[] = 'https://github.com/tesseract-ocr/tesseract'; + $msg[] = ''; + $msg[] = "The current \$PATH is $currentPath"; + $msg = join(PHP_EOL, $msg); + + throw new TesseractNotFoundException($msg); + } + + public static function checkCommandExecution($command, $stdout, $stderr) + { + if ($command->useFileAsOutput) { + $file = $command->getOutputFile(); + if (file_exists($file) && filesize($file) > 0) return; + } + + if (!$command->useFileAsOutput && $stdout) { + return; + } + + $msg = array(); + $msg[] = 'Error! The command did not produce any output.'; + $msg[] = ''; + $msg[] = 'Generated command:'; + $msg[] = "$command"; + $msg[] = ''; + $msg[] = 'Returned message:'; + $arrayStderr = explode(PHP_EOL, $stderr); + array_pop($arrayStderr); + $msg = array_merge($msg, $arrayStderr); + $msg = join(PHP_EOL, $msg); + + throw new UnsuccessfulCommandException($msg); + } + + public static function checkProcessCreation($processHandle, $command) + { + if ($processHandle !== FALSE) return; + + $msg = array(); + $msg[] = 'Error! 
The command could not be launched.'; + $msg[] = ''; + $msg[] = 'Generated command:'; + $msg[] = "$command"; + $msg = join(PHP_EOL, $msg); + + throw new UnsuccessfulCommandException($msg); + } + + public static function checkTesseractVersion($expected, $action, $command) + { + $actual = $command->getTesseractVersion(); + + if ($actual[0] === 'v') + $actual = substr($actual, 1); + + if (version_compare($actual, $expected, ">=")) return; + + $msg = array(); + $msg[] = "Error! $action is not available this tesseract version"; + $msg[] = "Required version is $expected, actual version is $actual"; + $msg[] = ''; + $msg[] = 'Generated command:'; + $msg[] = "$command"; + $msg = join(PHP_EOL, $msg); + + throw new FeatureNotAvailableException($msg); + } + + public static function checkWritePermissions($path) + { + if (!is_dir(dirname($path))) mkdir(dirname($path)); + $writableDirectory = is_writable(dirname($path)); + $writableFile = true; + if (file_exists($path)) $writableFile = is_writable($path); + if ($writableFile && $writableDirectory) return; + + $msg = array(); + $msg[] = "Error! No permission to write to $path"; + $msg[] = "Make sure you have the right outputFile and permissions " + ."to write to the folder"; + $msg[] = ''; + $msg = join(PHP_EOL, $msg); + + throw new NoWritePermissionsForOutputFile($msg); + } +} diff --git a/tesseract/vendor/thiagoalessio/tesseract_ocr/src/ImageNotFoundException.php b/tesseract/vendor/thiagoalessio/tesseract_ocr/src/ImageNotFoundException.php new file mode 100644 index 00000000..2ba7df64 --- /dev/null +++ b/tesseract/vendor/thiagoalessio/tesseract_ocr/src/ImageNotFoundException.php @@ -0,0 +1,7 @@ +=') ? 
'-' : '')."-psm $psm"; + }; + } + + public static function oem($oem) + { + return function($version) use ($oem) { + Option::checkMinVersion('3.05', $version, 'oem'); + return "--oem $oem"; + }; + } + + public static function dpi($dpi) + { + return function() use ($dpi) { + return "--dpi $dpi"; + }; + } + + public static function userWords($path) + { + return function($version) use ($path) { + Option::checkMinVersion('3.04', $version, 'user-words'); + return '--user-words "'.addcslashes($path, '\\"').'"'; + }; + } + + public static function userPatterns($path) + { + return function($version) use ($path) { + Option::checkMinVersion('3.04', $version, 'user-patterns'); + return '--user-patterns "'.addcslashes($path, '\\"').'"'; + }; + } + + public static function tessdataDir($path) + { + return function() use ($path) { + return '--tessdata-dir "'.addcslashes($path, '\\"').'"'; + }; + } + + public static function lang() + { + $languages = func_get_args(); + return function() use ($languages) { + return '-l '.join('+', $languages); + }; + } + + public static function config($var, $value) + { + return function() use($var, $value) { + $snakeCase = function($str) { + return strtolower(preg_replace('/([A-Z])+/', '_$1', $str)); + }; + $pair = $snakeCase($var).'='.$value; + return '-c "'.addcslashes($pair, '\\"').'"'; + }; + } + + public static function checkMinVersion($minVersion, $currVersion, $option) + { + $minVersion = preg_replace('/^v/', '', $minVersion); + $currVersion = preg_replace('/^v/', '', $currVersion); + if (!version_compare($currVersion, $minVersion, '<')) return; + $msg = "$option option is only available on Tesseract $minVersion or later."; + $msg.= PHP_EOL."Your version of Tesseract is $currVersion"; + throw new \Exception($msg); + } +} diff --git a/tesseract/vendor/thiagoalessio/tesseract_ocr/src/Process.php b/tesseract/vendor/thiagoalessio/tesseract_ocr/src/Process.php new file mode 100644 index 00000000..38460eb0 --- /dev/null +++ 
b/tesseract/vendor/thiagoalessio/tesseract_ocr/src/Process.php @@ -0,0 +1,83 @@ +startTime = microtime(true); + $streamDescriptors = [ + array("pipe", "r"), + array("pipe", "w"), + array("pipe", "w") + ]; + $this->handle = proc_open($command, $streamDescriptors, $pipes, NULL, NULL, ["bypass_shell" => true]); + list($this->stdin, $this->stdout, $this->stderr) = $pipes; + + FriendlyErrors::checkProcessCreation($this->handle, $command); + + //This is can avoid deadlock on some cases (when stderr buffer is filled up before writing to stdout and vice-versa) + stream_set_blocking($this->stdout, 0); + stream_set_blocking($this->stderr, 0); + } + + public function write($data, $len) + { + $total = 0; + do + { + $res = fwrite($this->stdin, substr($data, $total)); + } while($res && $total += $res < $len); + return $total === $len; + } + + + public function wait($timeout = 0) + { + $running = true; + $data = ["out" => "", "err" => ""]; + while (($running === true) && !$this->hasTimedOut($timeout)) + { + $data["out"] .= fread($this->stdout, 8192); + $data["err"] .= fread($this->stderr, 8192); + $procInfo = proc_get_status($this->handle); + $running = $procInfo["running"]; + if ($running) { + usleep(1000); // Sleep 1ms to yield CPU time + } + } + return $data; + } + + public function close() + { + $this->closeStream($this->stdin); + $this->closeStream($this->stdout); + $this->closeStream($this->stderr); + return proc_close($this->handle); + } + + public function closeStdin() + { + $this->closeStream($this->stdin); + } + + private function hasTimedOut($timeout) + { + return (($timeout > 0) && ($this->startTime + $timeout < microtime(true))); + } + + private function closeStream(&$stream) + { + if ($stream !== NULL) + { + fclose($stream); + $stream = NULL; + } + } +} diff --git a/tesseract/vendor/thiagoalessio/tesseract_ocr/src/TesseractNotFoundException.php b/tesseract/vendor/thiagoalessio/tesseract_ocr/src/TesseractNotFoundException.php new file mode 100644 index 
00000000..7b7f0c1e --- /dev/null +++ b/tesseract/vendor/thiagoalessio/tesseract_ocr/src/TesseractNotFoundException.php @@ -0,0 +1,7 @@ +command = $command ?: new Command; + $this->image("$image"); + } + + public function run($timeout = 0) + { + try { + if ($this->outputFile !== null) { + FriendlyErrors::checkWritePermissions($this->outputFile); + $this->command->useFileAsOutput = true; + } + + FriendlyErrors::checkTesseractPresence($this->command->executable); + if ($this->command->useFileAsInput) { + FriendlyErrors::checkImagePath($this->command->image); + } + + $process = new Process("{$this->command}"); + + if (!$this->command->useFileAsInput) { + $process->write($this->command->image, $this->command->imageSize); + $process->closeStdin(); + } + $output = $process->wait($timeout); + + FriendlyErrors::checkCommandExecution($this->command, $output["out"], $output["err"]); + } + catch (TesseractOcrException $e) { + if ($this->command->useFileAsOutput) $this->cleanTempFiles(); + throw $e; + } + + if ($this->command->useFileAsOutput) { + $text = file_get_contents($this->command->getOutputFile()); + + if ($this->outputFile !== null) { + rename($this->command->getOutputFile(), $this->outputFile); + } + + $this->cleanTempFiles(); + } + else + $text = $output["out"]; + + return trim($text, " \t\n\r\0\x0A\x0B\x0C"); + } + + public function imageData($image, $size) + { + FriendlyErrors::checkTesseractVersion("3.03-rc1", "Reading image data from stdin", $this->command); + $this->command->useFileAsInput = false; + $this->command->image = $image; + $this->command->imageSize = $size; + return $this; + } + + public function withoutTempFiles() + { + FriendlyErrors::checkTesseractVersion("3.03-rc1", "Writing to stdout (without using temp files)", $this->command); + $this->command->useFileAsOutput = false; + return $this; + } + + public function image($image) + { + $this->command->image = $image; + return $this; + } + + public function executable($executable) + { + 
FriendlyErrors::checkTesseractPresence($executable); + $this->command->executable = $executable; + return $this; + } + + public function configFile($configFile) + { + $this->command->configFile = $configFile; + return $this; + } + + public function tempDir($tempDir) + { + $this->command->tempDir = $tempDir; + return $this; + } + + public function threadLimit($limit) + { + $this->command->threadLimit = $limit; + return $this; + } + + // @deprecated + public function format($fmt) { return $this->configFile($fmt); } + + public function setOutputFile($path) { + $this->outputFile = $path; + return $this; + } + + public function allowlist() + { + $concat = function ($arg) { return is_array($arg) ? join('', $arg) : $arg; }; + $allowlist = join('', array_map($concat, func_get_args())); + $this->command->options[] = Option::config('tessedit_char_whitelist', $allowlist); + return $this; + } + + public function whitelist() + { + $warningMsg = 'Notice: whitelist is deprecated, use allowlist instead.'; + trigger_error($warningMsg, E_USER_NOTICE); + + $concat = function ($arg) { return is_array($arg) ? join('', $arg) : $arg; }; + $allowlist = join('', array_map($concat, func_get_args())); + return $this->allowlist($allowlist); + } + + public function version() + { + return $this->command->getTesseractVersion(); + } + + public function availableLanguages() + { + return $this->command->getAvailableLanguages(); + } + + public function __call($method, $args) + { + if ($this->isConfigFile($method)) return $this->configFile($method); + if ($this->isOption($method)) { + $option = $this->getOptionClassName().'::'.$method; + $this->command->options[] = call_user_func_array($option, $args); + return $this; + } + $arg = empty($args) ? 
null : $args[0]; + $this->command->options[] = Option::config($method, $arg); + return $this; + } + + private function isConfigFile($name) + { + return in_array($name, array('digits', 'hocr', 'pdf', 'quiet', 'tsv', 'txt')); + } + + private function isOption($name) + { + return in_array($name, get_class_methods($this->getOptionClassName())); + } + + private function getOptionClassName() + { + return __NAMESPACE__.'\\Option'; + } + + private function cleanTempFiles() + { + if (file_exists($this->command->getOutputFile(false))) { + unlink($this->command->getOutputFile(false)); + } + if (file_exists($this->command->getOutputFile(true))) { + unlink($this->command->getOutputFile(true)); + } + } +} diff --git a/tesseract/vendor/thiagoalessio/tesseract_ocr/src/TesseractOcrException.php b/tesseract/vendor/thiagoalessio/tesseract_ocr/src/TesseractOcrException.php new file mode 100644 index 00000000..8c078616 --- /dev/null +++ b/tesseract/vendor/thiagoalessio/tesseract_ocr/src/TesseractOcrException.php @@ -0,0 +1,7 @@ + Date: Mon, 15 Jan 2024 20:09:03 +0000 Subject: [PATCH 2/2] README.md created --- tesseract/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 tesseract/README.md diff --git a/tesseract/README.md b/tesseract/README.md new file mode 100644 index 00000000..d0b5eee2 --- /dev/null +++ b/tesseract/README.md @@ -0,0 +1 @@ +To make the addon work, you have to install the tesseract-ocr command line tool. \ No newline at end of file