Merge pull request 'New addon "tesseract" for OCR' (#1457) from heluecht/friendica-addons:tesseract into develop
Reviewed-on: https://git.friendi.ca/friendica/friendica-addons/pulls/1457
pull/1458/head
commit
2b391eb368
|
@ -0,0 +1 @@
|
||||||
|
To make the addon work, you have to install the tesseract-ocr command line tool.
|
|
@ -0,0 +1,5 @@
|
||||||
|
{
|
||||||
|
"require": {
|
||||||
|
"thiagoalessio/tesseract_ocr": "^2.13"
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,66 @@
|
||||||
|
{
|
||||||
|
"_readme": [
|
||||||
|
"This file locks the dependencies of your project to a known state",
|
||||||
|
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
||||||
|
"This file is @generated automatically"
|
||||||
|
],
|
||||||
|
"content-hash": "778b5479cb5d2b31b57f40473a87f8eb",
|
||||||
|
"packages": [
|
||||||
|
{
|
||||||
|
"name": "thiagoalessio/tesseract_ocr",
|
||||||
|
"version": "2.13.0",
|
||||||
|
"source": {
|
||||||
|
"type": "git",
|
||||||
|
"url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git",
|
||||||
|
"reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1"
|
||||||
|
},
|
||||||
|
"dist": {
|
||||||
|
"type": "zip",
|
||||||
|
"url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
|
||||||
|
"reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
|
||||||
|
"shasum": ""
|
||||||
|
},
|
||||||
|
"require": {
|
||||||
|
"php": "^5.3 || ^7.0 || ^8.0"
|
||||||
|
},
|
||||||
|
"require-dev": {
|
||||||
|
"phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
|
||||||
|
},
|
||||||
|
"type": "library",
|
||||||
|
"autoload": {
|
||||||
|
"psr-4": {
|
||||||
|
"thiagoalessio\\TesseractOCR\\": "src/"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"notification-url": "https://packagist.org/downloads/",
|
||||||
|
"license": [
|
||||||
|
"MIT"
|
||||||
|
],
|
||||||
|
"authors": [
|
||||||
|
{
|
||||||
|
"name": "thiagoalessio",
|
||||||
|
"email": "thiagoalessio@me.com"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "A wrapper to work with Tesseract OCR inside PHP.",
|
||||||
|
"keywords": [
|
||||||
|
"OCR",
|
||||||
|
"Tesseract",
|
||||||
|
"text recognition"
|
||||||
|
],
|
||||||
|
"time": "2023-10-05T21:14:48+00:00"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"packages-dev": [],
|
||||||
|
"aliases": [],
|
||||||
|
"minimum-stability": "stable",
|
||||||
|
"stability-flags": [],
|
||||||
|
"prefer-stable": false,
|
||||||
|
"prefer-lowest": false,
|
||||||
|
"platform": [],
|
||||||
|
"platform-dev": [],
|
||||||
|
"platform-overrides": {
|
||||||
|
"php": "7.2"
|
||||||
|
},
|
||||||
|
"plugin-api-version": "1.1.0"
|
||||||
|
}
|
|
@ -0,0 +1,33 @@
|
||||||
|
<?php
|
||||||
|
/**
|
||||||
|
* Name: Tesseract OCR
|
||||||
|
* Description: Use OCR to get text from images
|
||||||
|
* Version: 0.1
|
||||||
|
* Author: Michael Vogel <http://pirati.ca/profile/heluecht>
|
||||||
|
*/
|
||||||
|
|
||||||
|
use Friendica\Core\Hook;
|
||||||
|
use Friendica\Core\Logger;
|
||||||
|
use Friendica\Core\System;
|
||||||
|
use thiagoalessio\TesseractOCR\TesseractOCR;
|
||||||
|
|
||||||
|
require_once __DIR__ . DIRECTORY_SEPARATOR . 'vendor' . DIRECTORY_SEPARATOR . 'autoload.php';
|
||||||
|
|
||||||
|
/**
 * Registers this addon's OCR handler and logs that it did so.
 *
 * Attaches `tesseract_ocr_detection` (defined in this file) to the
 * 'ocr-detection' hook, then writes a notice-level log entry.
 * NOTE(review): presumably invoked by the addon framework when the addon is
 * installed — the caller is not visible from this file.
 */
function tesseract_install()
{
	$hookName = 'ocr-detection';
	$handler  = 'tesseract_ocr_detection';

	Hook::register($hookName, __FILE__, $handler);
	Logger::notice('installed tesseract');
}
|
||||||
|
|
||||||
|
/**
 * Hook handler for 'ocr-detection': runs Tesseract over raw image data and
 * stores the recognised text in the media array.
 *
 * The call is best effort: any error raised while creating or running the
 * OCR wrapper is logged and swallowed, leaving $media untouched.
 *
 * @param array $media Passed by reference. Reads 'img_str' (the image data as
 *                     a string) and, on success, writes 'description' with
 *                     whatever TesseractOCR::run() returned.
 * @return void
 */
function tesseract_ocr_detection(&$media)
{
	// Nothing to recognise. Bail out before touching the OCR library so that a
	// missing or empty 'img_str' neither raises an undefined-index warning /
	// strlen(null) nor spawns a tesseract process that has nothing to read.
	if (!isset($media['img_str']) || !is_string($media['img_str']) || $media['img_str'] === '') {
		return;
	}

	$image = $media['img_str'];

	try {
		// Constructed inside the try block so a failure here is handled like
		// any other OCR error instead of escaping to the hook dispatcher.
		$ocr = new TesseractOCR();
		$ocr->tempDir(System::getTempPath());
		// imageData() takes the data together with its size; strlen() yields the
		// byte count of the string, which is what is passed here.
		$ocr->imageData($image, strlen($image));
		$media['description'] = $ocr->run();
	} catch (\Throwable $th) {
		Logger::info('Error calling TesseractOCR', ['message' => $th->getMessage()]);
	}
}
|
|
@ -0,0 +1,7 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
// autoload.php @generated by Composer
|
||||||
|
|
||||||
|
// Pull in the generated bootstrap class that builds and registers the class loader.
require_once __DIR__ . '/composer/autoload_real.php';
|
||||||
|
|
||||||
|
// Hand the loader instance produced by getLoader() back to whoever includes this file.
return ComposerAutoloaderInit695d781792f754383aa61632167d066e::getLoader();
|
|
@ -0,0 +1,445 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This file is part of Composer.
|
||||||
|
*
|
||||||
|
* (c) Nils Adermann <naderman@naderman.de>
|
||||||
|
* Jordi Boggiano <j.boggiano@seld.be>
|
||||||
|
*
|
||||||
|
* For the full copyright and license information, please view the LICENSE
|
||||||
|
* file that was distributed with this source code.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace Composer\Autoload;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* ClassLoader implements a PSR-0, PSR-4 and classmap class loader.
|
||||||
|
*
|
||||||
|
* $loader = new \Composer\Autoload\ClassLoader();
|
||||||
|
*
|
||||||
|
* // register classes with namespaces
|
||||||
|
* $loader->add('Symfony\Component', __DIR__.'/component');
|
||||||
|
* $loader->add('Symfony', __DIR__.'/framework');
|
||||||
|
*
|
||||||
|
* // activate the autoloader
|
||||||
|
* $loader->register();
|
||||||
|
*
|
||||||
|
* // to enable searching the include path (eg. for PEAR packages)
|
||||||
|
* $loader->setUseIncludePath(true);
|
||||||
|
*
|
||||||
|
* In this example, if you try to use a class in the Symfony\Component
|
||||||
|
* namespace or one of its children (Symfony\Component\Console for instance),
|
||||||
|
* the autoloader will first look for the class under the component/
|
||||||
|
* directory, and it will then fallback to the framework/ directory if not
|
||||||
|
* found before giving up.
|
||||||
|
*
|
||||||
|
* This class is loosely based on the Symfony UniversalClassLoader.
|
||||||
|
*
|
||||||
|
* @author Fabien Potencier <fabien@symfony.com>
|
||||||
|
* @author Jordi Boggiano <j.boggiano@seld.be>
|
||||||
|
* @see http://www.php-fig.org/psr/psr-0/
|
||||||
|
* @see http://www.php-fig.org/psr/psr-4/
|
||||||
|
*/
|
||||||
|
class ClassLoader
{
    // PSR-4 state. $prefixLengthsPsr4 is keyed by the prefix's first character,
    // then by prefix, and holds strlen(prefix); $prefixDirsPsr4 maps prefix => dirs.
    private $prefixLengthsPsr4 = array();
    private $prefixDirsPsr4 = array();
    private $fallbackDirsPsr4 = array();

    // PSR-0 state. $prefixesPsr0 is keyed by first character, then prefix => dirs.
    private $prefixesPsr0 = array();
    private $fallbackDirsPsr0 = array();

    private $useIncludePath = false;
    private $classMap = array();
    private $classMapAuthoritative = false;
    // Classes findFile() already failed to locate (class name => true).
    private $missingClasses = array();
    private $apcuPrefix;

    /**
     * @return array All PSR-0 prefix => directories entries, combined with array_merge()
     */
    public function getPrefixes()
    {
        if (empty($this->prefixesPsr0)) {
            return array();
        }

        return call_user_func_array('array_merge', array_values($this->prefixesPsr0));
    }

    /**
     * @return array PSR-4 prefix => directories map
     */
    public function getPrefixesPsr4()
    {
        return $this->prefixDirsPsr4;
    }

    /**
     * @return array PSR-0 fallback directories
     */
    public function getFallbackDirs()
    {
        return $this->fallbackDirsPsr0;
    }

    /**
     * @return array PSR-4 fallback directories
     */
    public function getFallbackDirsPsr4()
    {
        return $this->fallbackDirsPsr4;
    }

    /**
     * @return array Class name => file map
     */
    public function getClassMap()
    {
        return $this->classMap;
    }

    /**
     * Adds entries to the class map; when a map already exists the two are
     * combined with array_merge().
     *
     * @param array $classMap Class to filename map
     */
    public function addClassMap(array $classMap)
    {
        $this->classMap = $this->classMap
            ? array_merge($this->classMap, $classMap)
            : $classMap;
    }

    /**
     * Adds PSR-0 directories for a prefix, before or after the ones already
     * registered for it. A falsy prefix targets the PSR-0 fallback directories.
     *
     * @param string       $prefix  The prefix
     * @param array|string $paths   The PSR-0 root directories
     * @param bool         $prepend Whether to prepend the directories
     */
    public function add($prefix, $paths, $prepend = false)
    {
        $dirs = (array) $paths;

        if (!$prefix) {
            $this->fallbackDirsPsr0 = $prepend
                ? array_merge($dirs, $this->fallbackDirsPsr0)
                : array_merge($this->fallbackDirsPsr0, $dirs);

            return;
        }

        $bucket = $prefix[0];

        if (isset($this->prefixesPsr0[$bucket][$prefix])) {
            $known = $this->prefixesPsr0[$bucket][$prefix];
            $this->prefixesPsr0[$bucket][$prefix] = $prepend
                ? array_merge($dirs, $known)
                : array_merge($known, $dirs);

            return;
        }

        // First registration for this prefix.
        $this->prefixesPsr0[$bucket][$prefix] = $dirs;
    }

    /**
     * Adds PSR-4 directories for a namespace, before or after the ones already
     * registered for it. A falsy prefix targets the PSR-4 fallback directories.
     *
     * @param string       $prefix  The prefix/namespace, with trailing '\\'
     * @param array|string $paths   The PSR-4 base directories
     * @param bool         $prepend Whether to prepend the directories
     *
     * @throws \InvalidArgumentException When a new, non-empty prefix does not end in '\\'
     */
    public function addPsr4($prefix, $paths, $prepend = false)
    {
        $dirs = (array) $paths;

        if (!$prefix) {
            // Root namespace.
            $this->fallbackDirsPsr4 = $prepend
                ? array_merge($dirs, $this->fallbackDirsPsr4)
                : array_merge($this->fallbackDirsPsr4, $dirs);

            return;
        }

        if (isset($this->prefixDirsPsr4[$prefix])) {
            // Namespace already known: extend its directory list.
            $known = $this->prefixDirsPsr4[$prefix];
            $this->prefixDirsPsr4[$prefix] = $prepend
                ? array_merge($dirs, $known)
                : array_merge($known, $dirs);

            return;
        }

        // New namespace: validate it, then record its length and directories.
        $length = strlen($prefix);
        if ('\\' !== $prefix[$length - 1]) {
            throw new \InvalidArgumentException("A non-empty PSR-4 prefix must end with a namespace separator.");
        }

        $this->prefixLengthsPsr4[$prefix[0]][$prefix] = $length;
        $this->prefixDirsPsr4[$prefix] = $dirs;
    }

    /**
     * Sets the PSR-0 directories for a prefix, discarding any previous ones.
     * A falsy prefix replaces the PSR-0 fallback directories.
     *
     * @param string       $prefix The prefix
     * @param array|string $paths  The PSR-0 base directories
     */
    public function set($prefix, $paths)
    {
        if (!$prefix) {
            $this->fallbackDirsPsr0 = (array) $paths;

            return;
        }

        $this->prefixesPsr0[$prefix[0]][$prefix] = (array) $paths;
    }

    /**
     * Sets the PSR-4 directories for a namespace, discarding any previous ones.
     * A falsy prefix replaces the PSR-4 fallback directories.
     *
     * @param string       $prefix The prefix/namespace, with trailing '\\'
     * @param array|string $paths  The PSR-4 base directories
     *
     * @throws \InvalidArgumentException When a non-empty prefix does not end in '\\'
     */
    public function setPsr4($prefix, $paths)
    {
        if (!$prefix) {
            $this->fallbackDirsPsr4 = (array) $paths;

            return;
        }

        $length = strlen($prefix);
        if ('\\' !== $prefix[$length - 1]) {
            throw new \InvalidArgumentException("A non-empty PSR-4 prefix must end with a namespace separator.");
        }

        $this->prefixLengthsPsr4[$prefix[0]][$prefix] = $length;
        $this->prefixDirsPsr4[$prefix] = (array) $paths;
    }

    /**
     * Enables or disables the include-path lookup used as the last PSR-0 step.
     *
     * @param bool $useIncludePath
     */
    public function setUseIncludePath($useIncludePath)
    {
        $this->useIncludePath = $useIncludePath;
    }

    /**
     * @return bool Whether the include path is consulted for classes
     */
    public function getUseIncludePath()
    {
        return $this->useIncludePath;
    }

    /**
     * When enabled, findFile() answers from the class map only and skips the
     * prefix and fallback directory searches.
     *
     * @param bool $classMapAuthoritative
     */
    public function setClassMapAuthoritative($classMapAuthoritative)
    {
        $this->classMapAuthoritative = $classMapAuthoritative;
    }

    /**
     * @return bool Whether lookups stop at the class map
     */
    public function isClassMapAuthoritative()
    {
        return $this->classMapAuthoritative;
    }

    /**
     * Stores the APCu key prefix used to cache lookups. The prefix is kept only
     * when apcu_fetch() exists and the 'apc.enabled' ini value is truthy;
     * otherwise null is stored.
     *
     * @param string|null $apcuPrefix
     */
    public function setApcuPrefix($apcuPrefix)
    {
        $apcuUsable = function_exists('apcu_fetch') && filter_var(ini_get('apc.enabled'), FILTER_VALIDATE_BOOLEAN);

        $this->apcuPrefix = $apcuUsable ? $apcuPrefix : null;
    }

    /**
     * @return string|null The APCu prefix in use, or null when none is stored
     */
    public function getApcuPrefix()
    {
        return $this->apcuPrefix;
    }

    /**
     * Registers loadClass() on the SPL autoload stack.
     *
     * @param bool $prepend Whether to prepend the autoloader or not
     */
    public function register($prepend = false)
    {
        spl_autoload_register(array($this, 'loadClass'), true, $prepend);
    }

    /**
     * Removes loadClass() from the SPL autoload stack.
     */
    public function unregister()
    {
        spl_autoload_unregister(array($this, 'loadClass'));
    }

    /**
     * Includes the file that defines the given class or interface, if one is found.
     *
     * @param  string    $class The name of the class
     * @return bool|null True if a file was included, null otherwise
     */
    public function loadClass($class)
    {
        $file = $this->findFile($class);
        if (!$file) {
            return null;
        }

        includeFile($file);

        return true;
    }

    /**
     * Resolves a class name to the file that defines it.
     *
     * Lookup order: class map, remembered misses, APCu cache (when a prefix is
     * set), then the PSR-4/PSR-0 search for '.php' and, when HHVM_VERSION is
     * defined, '.hh'.
     *
     * @param string $class The name of the class
     *
     * @return string|false The path if found, false otherwise
     */
    public function findFile($class)
    {
        if (isset($this->classMap[$class])) {
            return $this->classMap[$class];
        }

        if ($this->classMapAuthoritative || isset($this->missingClasses[$class])) {
            return false;
        }

        $useApcu = null !== $this->apcuPrefix;

        if ($useApcu) {
            $cached = apcu_fetch($this->apcuPrefix.$class, $hit);
            if ($hit) {
                return $cached;
            }
        }

        $file = $this->findFileWithExtension($class, '.php');

        // Hack source files are only considered when running on HHVM.
        if (false === $file && defined('HHVM_VERSION')) {
            $file = $this->findFileWithExtension($class, '.hh');
        }

        if ($useApcu) {
            // The result is cached whether or not a file was found.
            apcu_add($this->apcuPrefix.$class, $file);
        }

        if (false === $file) {
            // Remember the miss so the directory search is not repeated.
            $this->missingClasses[$class] = true;
        }

        return $file;
    }

    /**
     * Searches, in order: PSR-4 prefixes (longest namespace first), PSR-4
     * fallback dirs, PSR-0 prefixes, PSR-0 fallback dirs, and finally the
     * include path when enabled.
     *
     * @param string $class The name of the class
     * @param string $ext   File extension including the leading dot
     *
     * @return string|false The first existing candidate path, or false
     */
    private function findFileWithExtension($class, $ext)
    {
        $logicalPathPsr4 = strtr($class, '\\', DIRECTORY_SEPARATOR) . $ext;
        $first = $class[0];

        // PSR-4 prefixes: strip one namespace segment per iteration.
        if (isset($this->prefixLengthsPsr4[$first])) {
            $subPath = $class;
            $lastPos = strrpos($subPath, '\\');

            while (false !== $lastPos) {
                $subPath = substr($subPath, 0, $lastPos);
                $search = $subPath . '\\';

                if (isset($this->prefixDirsPsr4[$search])) {
                    $pathEnd = DIRECTORY_SEPARATOR . substr($logicalPathPsr4, $lastPos + 1);
                    $found = $this->firstExistingFile($this->prefixDirsPsr4[$search], $pathEnd);
                    if (false !== $found) {
                        return $found;
                    }
                }

                $lastPos = strrpos($subPath, '\\');
            }
        }

        // PSR-4 fallback dirs.
        $found = $this->firstExistingFile($this->fallbackDirsPsr4, DIRECTORY_SEPARATOR . $logicalPathPsr4);
        if (false !== $found) {
            return $found;
        }

        // PSR-0 logical path: underscores become separators, but only in the
        // class-name part after the last namespace separator.
        $pos = strrpos($class, '\\');
        if (false !== $pos) {
            $logicalPathPsr0 = substr($logicalPathPsr4, 0, $pos + 1)
                . strtr(substr($logicalPathPsr4, $pos + 1), '_', DIRECTORY_SEPARATOR);
        } else {
            $logicalPathPsr0 = strtr($class, '_', DIRECTORY_SEPARATOR) . $ext;
        }

        $psr0Suffix = DIRECTORY_SEPARATOR . $logicalPathPsr0;

        // PSR-0 prefixes.
        if (isset($this->prefixesPsr0[$first])) {
            foreach ($this->prefixesPsr0[$first] as $prefix => $dirs) {
                if (0 !== strpos($class, $prefix)) {
                    continue;
                }

                $found = $this->firstExistingFile($dirs, $psr0Suffix);
                if (false !== $found) {
                    return $found;
                }
            }
        }

        // PSR-0 fallback dirs.
        $found = $this->firstExistingFile($this->fallbackDirsPsr0, $psr0Suffix);
        if (false !== $found) {
            return $found;
        }

        // PSR-0 include paths.
        if ($this->useIncludePath) {
            $resolved = stream_resolve_include_path($logicalPathPsr0);
            if ($resolved) {
                return $resolved;
            }
        }

        return false;
    }

    /**
     * Returns the first "$dir . $suffix" that exists on disk, or false if none does.
     *
     * @param array  $dirs   Candidate base directories, tried in order
     * @param string $suffix Path to append to each directory (leading separator included)
     *
     * @return string|false
     */
    private function firstExistingFile($dirs, $suffix)
    {
        foreach ($dirs as $dir) {
            $candidate = $dir . $suffix;
            if (file_exists($candidate)) {
                return $candidate;
            }
        }

        return false;
    }
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Scope isolated include.
|
||||||
|
*
|
||||||
|
* Prevents access to $this/self from included files.
|
||||||
|
*/
|
||||||
|
function includeFile($file)
|
||||||
|
{
|
||||||
|
// Keep this body to the bare include: any local variable declared here would be visible to the included file.
include $file;
|
||||||
|
}
|
|
@ -0,0 +1,21 @@
|
||||||
|
|
||||||
|
Copyright (c) Nils Adermann, Jordi Boggiano
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is furnished
|
||||||
|
to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
THE SOFTWARE.
|
||||||
|
|
|
@ -0,0 +1,9 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
// autoload_classmap.php @generated by Composer
|
||||||
|
|
||||||
|
// Parent of the directory that contains this file.
$vendorDir = dirname(dirname(__FILE__));
|
||||||
|
// One level above $vendorDir; not referenced by the map below.
$baseDir = dirname($vendorDir);
|
||||||
|
|
||||||
|
// The class map returned to the including script is empty.
return array(
|
||||||
|
);
|
|
@ -0,0 +1,9 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
// autoload_namespaces.php @generated by Composer
|
||||||
|
|
||||||
|
// Parent of the directory that contains this file.
$vendorDir = dirname(dirname(__FILE__));
|
||||||
|
// One level above $vendorDir; not referenced by the map below.
$baseDir = dirname($vendorDir);
|
||||||
|
|
||||||
|
// The namespace map returned to the including script is empty.
return array(
|
||||||
|
);
|
|
@ -0,0 +1,10 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
// autoload_psr4.php @generated by Composer
|
||||||
|
|
||||||
|
// Parent of the directory that contains this file.
$vendorDir = dirname(dirname(__FILE__));
|
||||||
|
// One level above $vendorDir; not referenced by the map below.
$baseDir = dirname($vendorDir);
|
||||||
|
|
||||||
|
// Namespace prefix => list of base directories, returned to the including script.
return array(
|
||||||
|
'thiagoalessio\\TesseractOCR\\' => array($vendorDir . '/thiagoalessio/tesseract_ocr/src'),
|
||||||
|
);
|
|
@ -0,0 +1,55 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
// autoload_real.php @generated by Composer
|
||||||
|
|
||||||
|
class ComposerAutoloaderInit695d781792f754383aa61632167d066e
{
    /** Loader created by the first getLoader() call; null until then. */
    private static $loader;

    /**
     * Autoload callback that only knows how to load Composer's ClassLoader.
     *
     * @param string $class Fully qualified class name being autoloaded
     */
    public static function loadClassLoader($class)
    {
        if ('Composer\Autoload\ClassLoader' !== $class) {
            return;
        }

        require __DIR__ . '/ClassLoader.php';
    }

    /**
     * Builds, configures and registers the class loader on first use and
     * returns the same instance on every later call.
     *
     * @return \Composer\Autoload\ClassLoader
     */
    public static function getLoader()
    {
        if (null !== self::$loader) {
            return self::$loader;
        }

        // loadClassLoader() is registered only for as long as it takes to
        // instantiate the ClassLoader, then removed again.
        $bootstrap = array('ComposerAutoloaderInit695d781792f754383aa61632167d066e', 'loadClassLoader');
        spl_autoload_register($bootstrap, true, true);
        $loader = new \Composer\Autoload\ClassLoader();
        self::$loader = $loader;
        spl_autoload_unregister($bootstrap);

        // The static initializer is used on PHP >= 5.6 when HHVM_VERSION is not
        // defined and zend_loader_file_encoded() is absent or returns false.
        $useStaticLoader = PHP_VERSION_ID >= 50600
            && !defined('HHVM_VERSION')
            && !(function_exists('zend_loader_file_encoded') && zend_loader_file_encoded());

        if ($useStaticLoader) {
            require_once __DIR__ . '/autoload_static.php';

            $initializer = \Composer\Autoload\ComposerStaticInit695d781792f754383aa61632167d066e::getInitializer($loader);
            call_user_func($initializer);
        } else {
            // Otherwise feed the loader from the generated map files, one by one.
            $psr0Map = require __DIR__ . '/autoload_namespaces.php';
            foreach ($psr0Map as $namespace => $path) {
                $loader->set($namespace, $path);
            }

            $psr4Map = require __DIR__ . '/autoload_psr4.php';
            foreach ($psr4Map as $namespace => $path) {
                $loader->setPsr4($namespace, $path);
            }

            $classMap = require __DIR__ . '/autoload_classmap.php';
            if ($classMap) {
                $loader->addClassMap($classMap);
            }
        }

        // Prepended to the SPL autoload stack.
        $loader->register(true);

        return $loader;
    }
}
|
|
@ -0,0 +1,31 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
// autoload_static.php @generated by Composer
|
||||||
|
|
||||||
|
namespace Composer\Autoload;
|
||||||
|
|
||||||
|
class ComposerStaticInit695d781792f754383aa61632167d066e
{
    // First character => (PSR-4 prefix => length of that prefix).
    public static $prefixLengthsPsr4 = array(
        't' => array(
            'thiagoalessio\\TesseractOCR\\' => 27,
        ),
    );

    // PSR-4 prefix => base directories, expressed relative to this file's directory.
    public static $prefixDirsPsr4 = array(
        'thiagoalessio\\TesseractOCR\\' => array(
            __DIR__ . '/../thiagoalessio/tesseract_ocr/src',
        ),
    );

    /**
     * Returns a closure that copies the two PSR-4 maps above into the given
     * loader. The closure is bound to the ClassLoader class scope, which is
     * what lets it assign the loader's non-public properties.
     *
     * @param ClassLoader $loader Loader to initialise
     *
     * @return \Closure
     */
    public static function getInitializer(ClassLoader $loader)
    {
        // The class is named explicitly rather than via self:: because, once
        // bound, the closure's class scope is ClassLoader and not this class.
        $copyMaps = function () use ($loader) {
            $loader->prefixLengthsPsr4 = ComposerStaticInit695d781792f754383aa61632167d066e::$prefixLengthsPsr4;
            $loader->prefixDirsPsr4 = ComposerStaticInit695d781792f754383aa61632167d066e::$prefixDirsPsr4;
        };

        return \Closure::bind($copyMaps, null, ClassLoader::class);
    }
}
|
|
@ -0,0 +1,48 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"name": "thiagoalessio/tesseract_ocr",
|
||||||
|
"version": "2.13.0",
|
||||||
|
"version_normalized": "2.13.0.0",
|
||||||
|
"source": {
|
||||||
|
"type": "git",
|
||||||
|
"url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git",
|
||||||
|
"reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1"
|
||||||
|
},
|
||||||
|
"dist": {
|
||||||
|
"type": "zip",
|
||||||
|
"url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
|
||||||
|
"reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
|
||||||
|
"shasum": ""
|
||||||
|
},
|
||||||
|
"require": {
|
||||||
|
"php": "^5.3 || ^7.0 || ^8.0"
|
||||||
|
},
|
||||||
|
"require-dev": {
|
||||||
|
"phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
|
||||||
|
},
|
||||||
|
"time": "2023-10-05T21:14:48+00:00",
|
||||||
|
"type": "library",
|
||||||
|
"installation-source": "dist",
|
||||||
|
"autoload": {
|
||||||
|
"psr-4": {
|
||||||
|
"thiagoalessio\\TesseractOCR\\": "src/"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"notification-url": "https://packagist.org/downloads/",
|
||||||
|
"license": [
|
||||||
|
"MIT"
|
||||||
|
],
|
||||||
|
"authors": [
|
||||||
|
{
|
||||||
|
"name": "thiagoalessio",
|
||||||
|
"email": "thiagoalessio@me.com"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "A wrapper to work with Tesseract OCR inside PHP.",
|
||||||
|
"keywords": [
|
||||||
|
"OCR",
|
||||||
|
"Tesseract",
|
||||||
|
"text recognition"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
|
@ -0,0 +1,14 @@
|
||||||
|
---
|
||||||
|
build: false
|
||||||
|
|
||||||
|
install:
|
||||||
|
- ps: Set-Service wuauserv -StartupType Manual
|
||||||
|
- choco install php
|
||||||
|
- choco install capture2text --version 3.9
|
||||||
|
- choco install composer
|
||||||
|
- refreshenv
|
||||||
|
- cd %APPVEYOR_BUILD_FOLDER%
|
||||||
|
- composer install
|
||||||
|
|
||||||
|
test_script:
|
||||||
|
- php tests\run.php unit e2e
|
|
@ -0,0 +1,19 @@
|
||||||
|
Copyright (c) 2012-2021 Thiago Alessio Pereira
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
|
@ -0,0 +1,508 @@
|
||||||
|
<img src="https://thiagoalessio.github.io/tesseract-ocr-for-php/images/logo.png" alt="Tesseract OCR for PHP" align="right" width="320px"/>
|
||||||
|
|
||||||
|
# Tesseract OCR for PHP
|
||||||
|
|
||||||
|
A wrapper to work with Tesseract OCR inside PHP.
|
||||||
|
|
||||||
|
[![CI][ci_badge]][ci]
|
||||||
|
[![AppVeyor][appveyor_badge]][appveyor]
|
||||||
|
[![Codacy][codacy_badge]][codacy]
|
||||||
|
[![Test Coverage][test_coverage_badge]][test_coverage]
|
||||||
|
<br/>
|
||||||
|
[![Latest Stable Version][stable_version_badge]][packagist]
|
||||||
|
[![Total Downloads][total_downloads_badge]][packagist]
|
||||||
|
[![Monthly Downloads][monthly_downloads_badge]][packagist]
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
Via [Composer][]:
|
||||||
|
|
||||||
|
$ composer require thiagoalessio/tesseract_ocr
|
||||||
|
|
||||||
|
:bangbang: **This library depends on [Tesseract OCR][], version _3.02_ or later.**
|
||||||
|
|
||||||
|
<br/>
|
||||||
|
|
||||||
|
### ![][windows_icon] Note for Windows users
|
||||||
|
|
||||||
|
There are [many ways][tesseract_installation_on_windows] to install
|
||||||
|
[Tesseract OCR][] on your system, but if you just want something quick to
|
||||||
|
get up and running, I recommend installing the [Capture2Text][] package with
|
||||||
|
[Chocolatey][].
|
||||||
|
|
||||||
|
choco install capture2text --version 3.9
|
||||||
|
|
||||||
|
:warning: Recent versions of [Capture2Text][] stopped shipping the `tesseract` binary.
|
||||||
|
|
||||||
|
<br/>
|
||||||
|
|
||||||
|
### ![][macos_icon] Note for macOS users
|
||||||
|
|
||||||
|
With [MacPorts][] you can install support for individual languages, like so:
|
||||||
|
|
||||||
|
$ sudo port install tesseract-<langcode>
|
||||||
|
|
||||||
|
But that is not possible with [Homebrew][]. It comes only with **English** support
|
||||||
|
by default, so if you intend to use it for another language, the quickest solution
|
||||||
|
is to install them all:
|
||||||
|
|
||||||
|
$ brew install tesseract tesseract-lang
|
||||||
|
|
||||||
|
<br/>
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Basic usage
|
||||||
|
|
||||||
|
<img align="right" width="50%" title="The quick brown fox jumps over the lazy dog." src="./tests/EndToEnd/images/text.png"/>
|
||||||
|
|
||||||
|
```php
|
||||||
|
use thiagoalessio\TesseractOCR\TesseractOCR;
|
||||||
|
echo (new TesseractOCR('text.png'))
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
The quick brown fox
|
||||||
|
jumps over
|
||||||
|
the lazy dog.
|
||||||
|
```
|
||||||
|
|
||||||
|
<br/>
|
||||||
|
|
||||||
|
### Other languages
|
||||||
|
|
||||||
|
<img align="right" width="50%" title="Bülowstraße" src="./tests/EndToEnd/images/german.png"/>
|
||||||
|
|
||||||
|
```php
|
||||||
|
use thiagoalessio\TesseractOCR\TesseractOCR;
|
||||||
|
echo (new TesseractOCR('german.png'))
|
||||||
|
->lang('deu')
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
Bülowstraße
|
||||||
|
```
|
||||||
|
|
||||||
|
<br/>
|
||||||
|
|
||||||
|
### Multiple languages
|
||||||
|
|
||||||
|
<img align="right" width="50%" title="I eat すし y Pollo" src="./tests/EndToEnd/images/mixed-languages.png"/>
|
||||||
|
|
||||||
|
```php
|
||||||
|
use thiagoalessio\TesseractOCR\TesseractOCR;
|
||||||
|
echo (new TesseractOCR('mixed-languages.png'))
|
||||||
|
->lang('eng', 'jpn', 'spa')
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
I eat すし y Pollo
|
||||||
|
```
|
||||||
|
|
||||||
|
<br/>
|
||||||
|
|
||||||
|
### Inducing recognition
|
||||||
|
|
||||||
|
<img align="right" width="50%" title="8055" src="./tests/EndToEnd/images/8055.png"/>
|
||||||
|
|
||||||
|
```php
|
||||||
|
use thiagoalessio\TesseractOCR\TesseractOCR;
|
||||||
|
echo (new TesseractOCR('8055.png'))
|
||||||
|
->allowlist(range('A', 'Z'))
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
BOSS
|
||||||
|
```
|
||||||
|
|
||||||
|
<br/>
|
||||||
|
|
||||||
|
### Breaking CAPTCHAs
|
||||||
|
|
||||||
|
Yes, I know some of you might want to use this library for the *noble* purpose
|
||||||
|
of breaking CAPTCHAs, so please take a look at this comment:
|
||||||
|
|
||||||
|
<https://github.com/thiagoalessio/tesseract-ocr-for-php/issues/91#issuecomment-342290510>
|
||||||
|
|
||||||
|
## API
|
||||||
|
|
||||||
|
### run
|
||||||
|
|
||||||
|
Executes a `tesseract` command, optionally receiving an integer as `timeout`,
|
||||||
|
in case you experience stalled tesseract processes.
|
||||||
|
|
||||||
|
```php
|
||||||
|
$ocr = new TesseractOCR();
|
||||||
|
$ocr->run();
|
||||||
|
```
|
||||||
|
```php
|
||||||
|
$ocr = new TesseractOCR();
|
||||||
|
$timeout = 500;
|
||||||
|
$ocr->run($timeout);
|
||||||
|
```
|
||||||
|
|
||||||
|
### image
|
||||||
|
|
||||||
|
Define the path of an image to be recognized by `tesseract`.
|
||||||
|
|
||||||
|
```php
|
||||||
|
$ocr = new TesseractOCR();
|
||||||
|
$ocr->image('/path/to/image.png');
|
||||||
|
$ocr->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### imageData
|
||||||
|
|
||||||
|
Set the image to be recognized by `tesseract` from a string, with its size.
|
||||||
|
This can be useful when dealing with files that are already loaded in memory.
|
||||||
|
You can easily retrieve the image data and size of an image object:
|
||||||
|
```php
|
||||||
|
//Using Imagick
|
||||||
|
$data = $img->getImageBlob();
|
||||||
|
$size = $img->getImageLength();
|
||||||
|
//Using GD
|
||||||
|
ob_start();
|
||||||
|
// Note that you can use any format supported by tesseract
|
||||||
|
imagepng($img, null, 0);
|
||||||
|
$size = ob_get_length();
|
||||||
|
$data = ob_get_clean();
|
||||||
|
|
||||||
|
$ocr = new TesseractOCR();
|
||||||
|
$ocr->imageData($data, $size);
|
||||||
|
$ocr->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### executable
|
||||||
|
|
||||||
|
Define a custom location of the `tesseract` executable,
|
||||||
|
if for any reason it is not present in the `$PATH`.
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->executable('/path/to/tesseract')
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### version
|
||||||
|
|
||||||
|
Returns the current version of `tesseract`.
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR())->version();
|
||||||
|
```
|
||||||
|
|
||||||
|
### availableLanguages
|
||||||
|
|
||||||
|
Returns a list of available languages/scripts.
|
||||||
|
|
||||||
|
```php
|
||||||
|
foreach((new TesseractOCR())->availableLanguages() as $lang) echo $lang;
|
||||||
|
```
|
||||||
|
|
||||||
|
__More info:__ <https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages-and-scripts>
|
||||||
|
|
||||||
|
### tessdataDir
|
||||||
|
|
||||||
|
Specify a custom location for the tessdata directory.
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->tessdataDir('/path')
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### userWords
|
||||||
|
|
||||||
|
Specify the location of user words file.
|
||||||
|
|
||||||
|
This is a plain text file containing a list of words that you want to be
|
||||||
|
considered as normal dictionary words by `tesseract`.
|
||||||
|
|
||||||
|
Useful when dealing with contents that contain technical terminology, jargon,
|
||||||
|
etc.
|
||||||
|
|
||||||
|
```
|
||||||
|
$ cat /path/to/user-words.txt
|
||||||
|
foo
|
||||||
|
bar
|
||||||
|
```
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->userWords('/path/to/user-words.txt')
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### userPatterns
|
||||||
|
|
||||||
|
Specify the location of user patterns file.
|
||||||
|
|
||||||
|
If the contents you are dealing with have known patterns, this option can help
|
||||||
|
a lot with tesseract's recognition accuracy.
|
||||||
|
|
||||||
|
```
|
||||||
|
$ cat /path/to/user-patterns.txt
|
||||||
|
1-\d\d\d-GOOG-441
|
||||||
|
www.\n\\\*.com
|
||||||
|
```
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->userPatterns('/path/to/user-patterns.txt')
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### lang
|
||||||
|
|
||||||
|
Define one or more languages to be used during the recognition.
|
||||||
|
A complete list of available languages can be found at:
|
||||||
|
<https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages>
|
||||||
|
|
||||||
|
__Tip from [@daijiale][]:__ Use the combination `->lang('chi_sim', 'chi_tra')`
|
||||||
|
for proper recognition of Chinese.
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->lang('lang1', 'lang2', 'lang3')
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### psm
|
||||||
|
|
||||||
|
Specify the Page Segmentation Method, which instructs `tesseract` how to
|
||||||
|
interpret the given image.
|
||||||
|
|
||||||
|
__More info:__ <https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality#page-segmentation-method>
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->psm(6)
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### oem
|
||||||
|
|
||||||
|
Specify the OCR Engine Mode. (see `tesseract --help-oem`)
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->oem(2)
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### dpi
|
||||||
|
|
||||||
|
Specify the image DPI. It is useful if your image does not contain this information in its metadata.
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->dpi(300)
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### allowlist
|
||||||
|
|
||||||
|
This is a shortcut for `->config('tessedit_char_whitelist', 'abcdef....')`.
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->allowlist(range('a', 'z'), range(0, 9), '-_@')
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### configFile
|
||||||
|
|
||||||
|
Specify a config file to be used. It can either be the path to your own
|
||||||
|
config file or the name of one of the predefined config files:
|
||||||
|
<https://github.com/tesseract-ocr/tesseract/tree/master/tessdata/configs>
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->configFile('hocr')
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### setOutputFile
|
||||||
|
|
||||||
|
Specify an output file to be used. Be aware: if you set an output file, then
|
||||||
|
the option `withoutTempFiles` is ignored.
|
||||||
|
Tempfiles are written (and deleted) even if `withoutTempFiles = true`.
|
||||||
|
|
||||||
|
In combination with `configFile` you are able to get the `hocr`, `tsv` or
|
||||||
|
`pdf` files.
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->configFile('pdf')
|
||||||
|
->setOutputFile('/PATH_TO_MY_OUTPUTFILE/searchable.pdf')
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### digits
|
||||||
|
|
||||||
|
Shortcut for `->configFile('digits')`.
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->digits()
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### hocr
|
||||||
|
|
||||||
|
Shortcut for `->configFile('hocr')`.
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->hocr()
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### pdf
|
||||||
|
|
||||||
|
Shortcut for `->configFile('pdf')`.
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->pdf()
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### quiet
|
||||||
|
|
||||||
|
Shortcut for `->configFile('quiet')`.
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->quiet()
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### tsv
|
||||||
|
|
||||||
|
Shortcut for `->configFile('tsv')`.
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->tsv()
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### txt
|
||||||
|
|
||||||
|
Shortcut for `->configFile('txt')`.
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->txt()
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### tempDir
|
||||||
|
|
||||||
|
Define a custom directory to store temporary files generated by tesseract.
|
||||||
|
Make sure the directory actually exists and the user running `php` is allowed
|
||||||
|
to write in there.
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->tempDir('./my/custom/temp/dir')
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### withoutTempFiles
|
||||||
|
|
||||||
|
Specify that `tesseract` should output the recognized text without writing to temporary files.
|
||||||
|
The data is gathered from the standard output of `tesseract` instead.
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->withoutTempFiles()
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
### Other options
|
||||||
|
|
||||||
|
Any configuration option offered by Tesseract can be used like that:
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->config('config_var', 'value')
|
||||||
|
->config('other_config_var', 'other value')
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
Or like that:
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->configVar('value')
|
||||||
|
->otherConfigVar('other value')
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
__More info:__ <https://github.com/tesseract-ocr/tesseract/wiki/ControlParams>
|
||||||
|
|
||||||
|
### Thread-limit
|
||||||
|
|
||||||
|
Sometimes, it may be useful to limit the number of threads that tesseract is
|
||||||
|
allowed to use (e.g. in [this case](https://github.com/tesseract-ocr/tesseract/issues/898)).
|
||||||
|
Set the maximum number of threads with the `threadLimit` method before calling `run`:
|
||||||
|
|
||||||
|
```php
|
||||||
|
echo (new TesseractOCR('img.png'))
|
||||||
|
->threadLimit(1)
|
||||||
|
->run();
|
||||||
|
```
|
||||||
|
|
||||||
|
## How to contribute
|
||||||
|
|
||||||
|
You can contribute to this project by:
|
||||||
|
|
||||||
|
* Opening an [Issue][] if you found a bug or wish to propose a new feature;
|
||||||
|
* Placing a [Pull Request][] with code that fixes a bug, missing/wrong documentation
|
||||||
|
or implements a new feature;
|
||||||
|
|
||||||
|
Just make sure you take a look at our [Code of Conduct][] and [Contributing][]
|
||||||
|
instructions.
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
tesseract-ocr-for-php is released under the [MIT License][].
|
||||||
|
|
||||||
|
|
||||||
|
<h2></h2><p align="center"><sub>Made with <sub><a href="#"><img src="https://thiagoalessio.github.io/tesseract-ocr-for-php/images/heart.svg" alt="love" width="14px"/></a></sub> in Berlin</sub></p>
|
||||||
|
|
||||||
|
[ci_badge]: https://github.com/thiagoalessio/tesseract-ocr-for-php/workflows/CI/badge.svg?event=push&branch=main
|
||||||
|
[ci]: https://github.com/thiagoalessio/tesseract-ocr-for-php/actions?query=workflow%3ACI
|
||||||
|
[appveyor_badge]: https://ci.appveyor.com/api/projects/status/xwy5ls0798iwcim3/branch/main?svg=true
|
||||||
|
[appveyor]: https://ci.appveyor.com/project/thiagoalessio/tesseract-ocr-for-php/branch/main
|
||||||
|
[codacy_badge]: https://app.codacy.com/project/badge/Grade/a81aa10012874f23a57df5b492d835f2
|
||||||
|
[codacy]: https://www.codacy.com/gh/thiagoalessio/tesseract-ocr-for-php/dashboard
|
||||||
|
[test_coverage_badge]: https://codecov.io/gh/thiagoalessio/tesseract-ocr-for-php/branch/main/graph/badge.svg?token=Y0VnrqiSIf
|
||||||
|
[test_coverage]: https://codecov.io/gh/thiagoalessio/tesseract-ocr-for-php
|
||||||
|
[stable_version_badge]: https://img.shields.io/packagist/v/thiagoalessio/tesseract_ocr.svg
|
||||||
|
[packagist]: https://packagist.org/packages/thiagoalessio/tesseract_ocr
|
||||||
|
[total_downloads_badge]: https://img.shields.io/packagist/dt/thiagoalessio/tesseract_ocr.svg
|
||||||
|
[monthly_downloads_badge]: https://img.shields.io/packagist/dm/thiagoalessio/tesseract_ocr.svg
|
||||||
|
[Tesseract OCR]: https://github.com/tesseract-ocr/tesseract
|
||||||
|
[Composer]: http://getcomposer.org/
|
||||||
|
[windows_icon]: https://thiagoalessio.github.io/tesseract-ocr-for-php/images/windows-18.svg
|
||||||
|
[macos_icon]: https://thiagoalessio.github.io/tesseract-ocr-for-php/images/apple-18.svg
|
||||||
|
[tesseract_installation_on_windows]: https://github.com/tesseract-ocr/tesseract/wiki#windows
|
||||||
|
[Capture2Text]: https://chocolatey.org/packages/capture2text
|
||||||
|
[Chocolatey]: https://chocolatey.org
|
||||||
|
[MacPorts]: https://www.macports.org
|
||||||
|
[Homebrew]: https://brew.sh
|
||||||
|
[@daijiale]: https://github.com/daijiale
|
||||||
|
[HOCR]: https://github.com/tesseract-ocr/tesseract/wiki/Command-Line-Usage#hocr-output
|
||||||
|
[TSV]: https://github.com/tesseract-ocr/tesseract/wiki/Command-Line-Usage#tsv-output-currently-available-in-305-dev-in-master-branch-on-github
|
||||||
|
[Issue]: https://github.com/thiagoalessio/tesseract-ocr-for-php/issues
|
||||||
|
[Pull Request]: https://github.com/thiagoalessio/tesseract-ocr-for-php/pulls
|
||||||
|
[Code of Conduct]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/.github/CODE_OF_CONDUCT.md
|
||||||
|
[Contributing]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/.github/CONTRIBUTING.md
|
||||||
|
[MIT License]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/MIT-LICENSE
|
|
@ -0,0 +1,4 @@
|
||||||
|
fixes:
|
||||||
|
- "/home/runner/work/tesseract-ocr-for-php/tesseract-ocr-for-php/::"
|
||||||
|
- "/Users/runner/work/tesseract-ocr-for-php/tesseract-ocr-for-php/::"
|
||||||
|
- "C:\\projects\\tesseract-ocr-for-php\\::"
|
|
@ -0,0 +1,35 @@
|
||||||
|
{
|
||||||
|
"name": "thiagoalessio/tesseract_ocr",
|
||||||
|
"description": "A wrapper to work with Tesseract OCR inside PHP.",
|
||||||
|
"version": "2.13.0",
|
||||||
|
"type": "library",
|
||||||
|
"keywords": ["Tesseract", "OCR", "text recognition"],
|
||||||
|
"license": "MIT",
|
||||||
|
"authors": [
|
||||||
|
{
|
||||||
|
"name": "thiagoalessio",
|
||||||
|
"email": "thiagoalessio@me.com"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"support": {
|
||||||
|
"issues": "https://github.com/thiagoalessio/tesseract-ocr-for-php/issues",
|
||||||
|
"irc": "irc://irc.freenode.net/tesseract-ocr-for-php",
|
||||||
|
"source": "https://github.com/thiagoalessio/tesseract-ocr-for-php"
|
||||||
|
},
|
||||||
|
"require": {
|
||||||
|
"php": "^5.3 || ^7.0 || ^8.0"
|
||||||
|
},
|
||||||
|
"require-dev": {
|
||||||
|
"phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
|
||||||
|
},
|
||||||
|
"autoload": {
|
||||||
|
"psr-4": {
|
||||||
|
"thiagoalessio\\TesseractOCR\\": "src/"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"autoload-dev": {
|
||||||
|
"psr-4": {
|
||||||
|
"thiagoalessio\\TesseractOCR\\Tests\\": "tests/"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,80 @@
|
||||||
|
<?php namespace thiagoalessio\TesseractOCR;
|
||||||
|
|
||||||
|
/**
 * Assembles the shell command line used to invoke the `tesseract` executable.
 *
 * Casting an instance to string (or calling build()) yields the complete
 * command, including the optional thread-limit prefix, input, output,
 * options and config file.
 */
class Command
{
    /** @var string Name of, or path to, the tesseract executable. */
    public $executable = 'tesseract';

    /** @var bool When true the escaped $image is passed as input; otherwise "-" is passed. */
    public $useFileAsInput = true;

    /** @var bool When true the escaped output file base name is passed; otherwise "-" is passed. */
    public $useFileAsOutput = true;

    /** @var array Extra arguments: plain strings, or callables that receive the tesseract version. */
    public $options = array();

    /** @var string|null Config file name or path, appended as the last argument when set. */
    public $configFile;

    /** @var string|null Directory for generated output files; sys_get_temp_dir() is used when empty. */
    public $tempDir;

    /** @var int|null When truthy, emitted as an "OMP_THREAD_LIMIT=<n>" prefix of the command. */
    public $threadLimit;

    /** @var string|null Input image passed on the command line when $useFileAsInput is true. */
    public $image;

    /** @var int|null Not read by this class; presumably the byte size of in-memory image data -- TODO confirm. */
    public $imageSize;

    /** @var string|null Output file path without extension; generated lazily by getOutputFile(). */
    private $outputFile;

    /**
     * @param string|null $image      Input image.
     * @param string|null $outputFile Output file path without extension.
     */
    public function __construct($image=null, $outputFile=null)
    {
        $this->image = $image;
        $this->outputFile = $outputFile;
    }

    /**
     * Returns the command line as a string (same as casting to string).
     *
     * @return string
     */
    public function build() { return "$this"; }

    /**
     * Builds the full command line.
     *
     * Note that this runs the executable with `--version` first, because
     * callable options receive the detected version.
     *
     * @return string
     */
    public function __toString()
    {
        $cmd = array();
        if ($this->threadLimit) $cmd[] = "OMP_THREAD_LIMIT={$this->threadLimit}";
        $cmd[] = self::escape($this->executable);
        $cmd[] = $this->useFileAsInput ? self::escape($this->image) : "-";
        $cmd[] = $this->useFileAsOutput ? self::escape($this->getOutputFile(false)) : "-";

        $version = $this->getTesseractVersion();

        foreach ($this->options as $option) {
            $cmd[] = is_callable($option) ? $option($version) : "$option";
        }
        if ($this->configFile) $cmd[] = $this->configFile;

        return join(' ', $cmd);
    }

    /**
     * Returns the output file path, generating a unique one in the temp
     * directory on first use.
     *
     * @param bool $withExt When true, appends the extension: the config file
     *                      name for 'hocr', 'tsv' and 'pdf', otherwise 'txt'.
     * @return string
     */
    public function getOutputFile($withExt=true)
    {
        if (!$this->outputFile)
            $this->outputFile = $this->getTempDir()
                .DIRECTORY_SEPARATOR
                .basename(tempnam($this->getTempDir(), 'ocr'));
        if (!$withExt) return $this->outputFile;

        $hasCustomExt = array('hocr', 'tsv', 'pdf');
        $ext = in_array($this->configFile, $hasCustomExt) ? $this->configFile : 'txt';
        return "{$this->outputFile}.{$ext}";
    }

    /**
     * @return string The configured temp directory, or the system default.
     */
    public function getTempDir()
    {
        return $this->tempDir ?: sys_get_temp_dir();
    }

    /**
     * Runs `<executable> --version` and returns the second space-separated
     * token of the first output line.
     *
     * @return string|null The version token, or null when the command printed
     *                     nothing or the first line has no second token.
     */
    public function getTesseractVersion()
    {
        exec(self::escape($this->executable).' --version 2>&1', $output);

        // The executable may print nothing at all; avoid indexing a missing
        // line/token, which would only raise warnings and yield null anyway.
        if (!isset($output[0])) return null;

        $outputParts = explode(' ', $output[0]);
        return isset($outputParts[1]) ? $outputParts[1] : null;
    }

    /**
     * Runs `<executable> --list-langs`, discards the first output line and
     * returns the remaining lines sorted.
     *
     * @return array
     */
    public function getAvailableLanguages()
    {
        exec(self::escape($this->executable) . ' --list-langs 2>&1', $output);
        array_shift($output);
        sort($output);
        return $output;
    }

    /**
     * Wraps a value in double quotes, backslash-escaping `$`, `"` and the
     * backtick; backslashes are escaped too, except on Windows.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        $charlist = strtoupper(substr(PHP_OS, 0, 3)) == 'WIN' ? '$"`' : '$"\\`';
        return '"'.addcslashes($str, $charlist).'"';
    }
}
|
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/FeatureNotAvailableException.php
vendored
Normal file
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/FeatureNotAvailableException.php
vendored
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace thiagoalessio\TesseractOCR;
|
||||||
|
|
||||||
|
/**
 * Raised by FriendlyErrors::checkTesseractVersion() when the detected
 * tesseract version is older than the one a feature requires.
 */
class FeatureNotAvailableException extends TesseractOcrException {}
|
|
@ -0,0 +1,120 @@
|
||||||
|
<?php namespace thiagoalessio\TesseractOCR;
|
||||||
|
|
||||||
|
/**
 * Pre- and post-condition checks that turn common failures into exceptions
 * with descriptive, multi-line messages.
 */
class FriendlyErrors
{
    /**
     * Ensures the given image path exists.
     *
     * @param string $image
     * @throws ImageNotFoundException When the path does not exist.
     */
    public static function checkImagePath($image)
    {
        if (file_exists($image)) return;

        $currentDir = __DIR__;
        $msg = array();
        $msg[] = "Error! The image \"$image\" was not found.";
        $msg[] = '';
        $msg[] = "The current __DIR__ is $currentDir";
        $msg = join(PHP_EOL, $msg);

        throw new ImageNotFoundException($msg);
    }

    /**
     * Ensures the executable exists, either as a file path or as a command
     * that `where.exe` (Windows) / `type` (elsewhere) can resolve.
     *
     * @param string $executable
     * @throws TesseractNotFoundException When the executable cannot be found.
     */
    public static function checkTesseractPresence($executable)
    {
        if (file_exists($executable)) return;

        $cmd = stripos(PHP_OS, 'win') === 0
            ? 'where.exe '.Command::escape($executable).' > NUL 2>&1'
            : 'type '.Command::escape($executable).' > /dev/null 2>&1';
        system($cmd, $exitCode);

        if ($exitCode == 0) return;

        $currentPath = getenv('PATH');
        $msg = array();
        $msg[] = "Error! The command \"$executable\" was not found.";
        $msg[] = '';
        $msg[] = 'Make sure you have Tesseract OCR installed on your system:';
        $msg[] = 'https://github.com/tesseract-ocr/tesseract';
        $msg[] = '';
        $msg[] = "The current \$PATH is $currentPath";
        $msg = join(PHP_EOL, $msg);

        throw new TesseractNotFoundException($msg);
    }

    /**
     * Ensures the executed command produced output: a non-empty output file
     * when the command writes to a file, otherwise a non-empty $stdout.
     *
     * @param Command $command
     * @param string  $stdout
     * @param string  $stderr Included line by line in the error message.
     * @throws UnsuccessfulCommandException When no output was produced.
     */
    public static function checkCommandExecution($command, $stdout, $stderr)
    {
        if ($command->useFileAsOutput) {
            $file = $command->getOutputFile();
            if (file_exists($file) && filesize($file) > 0) return;
        }

        if (!$command->useFileAsOutput && $stdout) {
            return;
        }

        $msg = array();
        $msg[] = 'Error! The command did not produce any output.';
        $msg[] = '';
        $msg[] = 'Generated command:';
        $msg[] = "$command";
        $msg[] = '';
        $msg[] = 'Returned message:';
        $arrayStderr = explode(PHP_EOL, $stderr);
        // Only drop the empty element left by a trailing line break; when
        // stderr does not end with one, the last line is real content.
        if (end($arrayStderr) === '') array_pop($arrayStderr);
        $msg = array_merge($msg, $arrayStderr);
        $msg = join(PHP_EOL, $msg);

        throw new UnsuccessfulCommandException($msg);
    }

    /**
     * Ensures the process handle is not FALSE.
     *
     * @param mixed  $processHandle
     * @param string $command       Command line, shown in the error message.
     * @throws UnsuccessfulCommandException When the handle is FALSE.
     */
    public static function checkProcessCreation($processHandle, $command)
    {
        if ($processHandle !== FALSE) return;

        $msg = array();
        $msg[] = 'Error! The command could not be launched.';
        $msg[] = '';
        $msg[] = 'Generated command:';
        $msg[] = "$command";
        $msg = join(PHP_EOL, $msg);

        throw new UnsuccessfulCommandException($msg);
    }

    /**
     * Ensures the detected tesseract version is at least $expected.
     *
     * @param string  $expected Minimum required version.
     * @param string  $action   Description of the feature, used in the message.
     * @param Command $command  Provides getTesseractVersion().
     * @throws FeatureNotAvailableException When the detected version is lower.
     */
    public static function checkTesseractVersion($expected, $action, $command)
    {
        // Strip an optional leading "v". The string cast keeps a missing
        // version (null) from triggering offset warnings; it then compares as
        // lower than any expected version.
        $actual = preg_replace('/^v/', '', (string) $command->getTesseractVersion());

        if (version_compare($actual, $expected, ">=")) return;

        $msg = array();
        $msg[] = "Error! $action is not available this tesseract version";
        $msg[] = "Required version is $expected, actual version is $actual";
        $msg[] = '';
        $msg[] = 'Generated command:';
        $msg[] = "$command";
        $msg = join(PHP_EOL, $msg);

        throw new FeatureNotAvailableException($msg);
    }

    /**
     * Ensures $path can be written: its directory is created when missing and
     * must be writable, and an already existing file must be writable too.
     *
     * @param string $path
     * @throws NoWritePermissionsForOutputFile When the directory or file is not writable.
     */
    public static function checkWritePermissions($path)
    {
        $directory = dirname($path);
        // Create missing parent directories as well, so nested output paths work.
        if (!is_dir($directory)) mkdir($directory, 0777, true);
        $writableDirectory = is_writable($directory);
        $writableFile = true;
        if (file_exists($path)) $writableFile = is_writable($path);
        if ($writableFile && $writableDirectory) return;

        $msg = array();
        $msg[] = "Error! No permission to write to $path";
        $msg[] = "Make sure you have the right outputFile and permissions "
            ."to write to the folder";
        $msg[] = '';
        $msg = join(PHP_EOL, $msg);

        throw new NoWritePermissionsForOutputFile($msg);
    }
}
|
|
@ -0,0 +1,7 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace thiagoalessio\TesseractOCR;
|
||||||
|
|
||||||
|
/**
 * Raised by FriendlyErrors::checkImagePath() when the image path does not exist.
 */
class ImageNotFoundException extends TesseractOcrException {}
|
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/NoWritePermissionsForOutputFile.php
vendored
Normal file
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/NoWritePermissionsForOutputFile.php
vendored
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace thiagoalessio\TesseractOCR;
|
||||||
|
|
||||||
|
/**
 * Raised by FriendlyErrors::checkWritePermissions() when the output file or
 * its directory is not writable.
 */
class NoWritePermissionsForOutputFile extends TesseractOcrException {}
|
|
@ -0,0 +1,79 @@
|
||||||
|
<?php namespace thiagoalessio\TesseractOCR;
|
||||||
|
|
||||||
|
/**
 * Factories for command-line options.
 *
 * Every factory returns a closure that produces the option string; closures
 * that depend on the tesseract version receive it as their argument.
 */
class Option
{
    /**
     * Page segmentation option: "--psm" from version 4 on, "-psm" before.
     *
     * @param mixed $psm
     * @return \Closure
     */
    public static function psm($psm)
    {
        return function($version) use ($psm) {
            $plainVersion = preg_replace('/^v/', '', $version);
            $dashes = version_compare($plainVersion, 4, '>=') ? '--' : '-';
            return "{$dashes}psm $psm";
        };
    }

    /**
     * OCR engine mode option; requires version 3.05 or later.
     *
     * @param mixed $oem
     * @return \Closure
     */
    public static function oem($oem)
    {
        return function($version) use ($oem) {
            Option::checkMinVersion('3.05', $version, 'oem');
            return '--oem '.$oem;
        };
    }

    /**
     * Image DPI option.
     *
     * @param mixed $dpi
     * @return \Closure
     */
    public static function dpi($dpi)
    {
        return function() use ($dpi) {
            return '--dpi '.$dpi;
        };
    }

    /**
     * User words file option; requires version 3.04 or later.
     *
     * @param string $path
     * @return \Closure
     */
    public static function userWords($path)
    {
        return function($version) use ($path) {
            Option::checkMinVersion('3.04', $version, 'user-words');
            $quoted = '"'.addcslashes($path, '\\"').'"';
            return '--user-words '.$quoted;
        };
    }

    /**
     * User patterns file option; requires version 3.04 or later.
     *
     * @param string $path
     * @return \Closure
     */
    public static function userPatterns($path)
    {
        return function($version) use ($path) {
            Option::checkMinVersion('3.04', $version, 'user-patterns');
            $quoted = '"'.addcslashes($path, '\\"').'"';
            return '--user-patterns '.$quoted;
        };
    }

    /**
     * Tessdata directory option.
     *
     * @param string $path
     * @return \Closure
     */
    public static function tessdataDir($path)
    {
        return function() use ($path) {
            $quoted = '"'.addcslashes($path, '\\"').'"';
            return '--tessdata-dir '.$quoted;
        };
    }

    /**
     * Language option; all arguments are joined with "+".
     *
     * @return \Closure
     */
    public static function lang()
    {
        $languages = func_get_args();
        return function() use ($languages) {
            return '-l '.implode('+', $languages);
        };
    }

    /**
     * Config variable option ("-c name=value"); the variable name is
     * converted from camelCase to snake_case.
     *
     * @param string $var
     * @param mixed  $value
     * @return \Closure
     */
    public static function config($var, $value)
    {
        return function() use($var, $value) {
            $name = strtolower(preg_replace('/([A-Z])+/', '_$1', $var));
            $assignment = $name.'='.$value;
            return '-c "'.addcslashes($assignment, '\\"').'"';
        };
    }

    /**
     * Throws when the current version is lower than the minimum version;
     * a leading "v" is ignored on both.
     *
     * @param string $minVersion
     * @param string $currVersion
     * @param string $option      Option name used in the message.
     * @throws \Exception When $currVersion is lower than $minVersion.
     */
    public static function checkMinVersion($minVersion, $currVersion, $option)
    {
        $minVersion = preg_replace('/^v/', '', $minVersion);
        $currVersion = preg_replace('/^v/', '', $currVersion);

        if (version_compare($currVersion, $minVersion, '<')) {
            $msg = "$option option is only available on Tesseract $minVersion or later.";
            $msg.= PHP_EOL."Your version of Tesseract is $currVersion";
            throw new \Exception($msg);
        }
    }
}
|
|
@ -0,0 +1,83 @@
|
||||||
|
<?php namespace thiagoalessio\TesseractOCR;
|
||||||
|
|
||||||
|
/**
 * Thin wrapper around proc_open() that exposes the child's stdin for writing
 * and collects its stdout/stderr while waiting for it to finish.
 */
class Process {

    private $stdin;
    private $stdout;
    private $stderr;
    private $handle;
    private $startTime;

    /**
     * Launches the command and switches its stdout/stderr pipes to
     * non-blocking mode.
     *
     * @param string $command Command line to execute.
     * @throws UnsuccessfulCommandException When the process cannot be launched
     *         (raised by FriendlyErrors::checkProcessCreation()).
     */
    public function __construct($command)
    {
        $this->startTime = microtime(true);
        $streamDescriptors = [
            array("pipe", "r"),
            array("pipe", "w"),
            array("pipe", "w")
        ];
        $this->handle = proc_open($command, $streamDescriptors, $pipes, NULL, NULL, ["bypass_shell" => true]);

        // Fail before touching the pipes, which are unusable without a process.
        FriendlyErrors::checkProcessCreation($this->handle, $command);

        list($this->stdin, $this->stdout, $this->stderr) = $pipes;

        // Non-blocking reads avoid a deadlock in some cases (when the stderr
        // buffer fills up before anything is written to stdout, and vice versa).
        stream_set_blocking($this->stdout, 0);
        stream_set_blocking($this->stderr, 0);
    }

    /**
     * Writes $data to the child's stdin, resuming after partial writes.
     *
     * @param string $data Bytes to send.
     * @param int    $len  Expected number of bytes to be written.
     * @return bool True when exactly $len bytes were written.
     */
    public function write($data, $len)
    {
        $total = 0;
        while ($total < $len)
        {
            $res = fwrite($this->stdin, substr($data, $total));
            // fwrite() reports failure as false and "nothing accepted" as 0;
            // stop in both cases instead of looping forever.
            if ($res === false || $res === 0) break;
            $total += $res;
        }
        return $total === $len;
    }

    /**
     * Collects stdout/stderr until the process exits or the timeout elapses.
     *
     * @param int|float $timeout Seconds since construction after which waiting
     *                           stops; 0 disables the timeout.
     * @return array ["out" => string, "err" => string]
     */
    public function wait($timeout = 0)
    {
        $running = true;
        $data = ["out" => "", "err" => ""];
        while (($running === true) && !$this->hasTimedOut($timeout))
        {
            $data["out"] .= fread($this->stdout, 8192);
            $data["err"] .= fread($this->stderr, 8192);
            $procInfo = proc_get_status($this->handle);
            $running = $procInfo["running"];
            if ($running) {
                usleep(1000); // Sleep 1ms to yield CPU time
            }
        }

        if ($running !== true) {
            // The process may have written more than one 8192-byte chunk (or
            // written between the last read and its exit); drain what is left
            // so the output is not truncated.
            $data["out"] .= stream_get_contents($this->stdout);
            $data["err"] .= stream_get_contents($this->stderr);
        }
        return $data;
    }

    /**
     * Closes all pipes that are still open and then the process itself.
     *
     * @return int The value returned by proc_close().
     */
    public function close()
    {
        $this->closeStream($this->stdin);
        $this->closeStream($this->stdout);
        $this->closeStream($this->stderr);
        return proc_close($this->handle);
    }

    /**
     * Closes the child's stdin pipe.
     */
    public function closeStdin()
    {
        $this->closeStream($this->stdin);
    }

    /**
     * @param int|float $timeout Seconds; values <= 0 never time out.
     * @return bool True when more than $timeout seconds passed since construction.
     */
    private function hasTimedOut($timeout)
    {
        return (($timeout > 0) && ($this->startTime + $timeout < microtime(true)));
    }

    /**
     * Closes the stream if it is set and resets the variable to NULL so it is
     * not closed twice.
     *
     * @param resource|null $stream
     */
    private function closeStream(&$stream)
    {
        if ($stream !== NULL)
        {
            fclose($stream);
            $stream = NULL;
        }
    }
}
|
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/TesseractNotFoundException.php
vendored
Normal file
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/TesseractNotFoundException.php
vendored
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace thiagoalessio\TesseractOCR;
|
||||||
|
|
||||||
|
/**
 * Raised by FriendlyErrors::checkTesseractPresence() when the tesseract
 * executable cannot be found.
 */
class TesseractNotFoundException extends TesseractOcrException {}
|
|
@ -0,0 +1,181 @@
|
||||||
|
<?php namespace thiagoalessio\TesseractOCR;
|
||||||
|
|
||||||
|
use thiagoalessio\TesseractOCR\Command;
|
||||||
|
use thiagoalessio\TesseractOCR\Option;
|
||||||
|
use thiagoalessio\TesseractOCR\FriendlyErrors;
|
||||||
|
|
||||||
|
/**
 * Fluent front-end for running the tesseract executable.
 *
 * Every setter stores its value on the underlying command object and returns
 * $this so calls can be chained; run() executes the command and returns the
 * recognised text.
 */
class TesseractOCR
{
    /** Command object holding the executable, image, options and I/O flags. */
    public $command;

    /** Path given to setOutputFile(), or null when no output file was requested. */
    private $outputFile = null;

    /**
     * @param mixed $image   Image reference; cast to string before being stored,
     *                       so null ends up as an empty string.
     * @param mixed $command Command object to use; a new Command is created
     *                       when a falsy value is given.
     */
    public function __construct($image = null, $command = null)
    {
        if (!$command) {
            $command = new Command;
        }
        $this->command = $command;
        $this->image("$image");
    }

    /**
     * Executes the command and returns the trimmed text it produced.
     *
     * @param int $timeout Passed unchanged to Process::wait().
     * @return string Text read from the output file or from the process output.
     * @throws TesseractOcrException Re-thrown after temp files were removed
     *                               (removal only happens in file-output mode).
     */
    public function run($timeout = 0)
    {
        try {
            // A user-requested output file forces file-based output.
            if (null !== $this->outputFile) {
                FriendlyErrors::checkWritePermissions($this->outputFile);
                $this->command->useFileAsOutput = true;
            }

            FriendlyErrors::checkTesseractPresence($this->command->executable);
            if ($this->command->useFileAsInput) {
                FriendlyErrors::checkImagePath($this->command->image);
            }

            $process = new Process("{$this->command}");

            // Without an input file the image data is fed through stdin.
            if (!$this->command->useFileAsInput) {
                $process->write($this->command->image, $this->command->imageSize);
                $process->closeStdin();
            }

            $output = $process->wait($timeout);
            FriendlyErrors::checkCommandExecution($this->command, $output["out"], $output["err"]);
        } catch (TesseractOcrException $e) {
            if ($this->command->useFileAsOutput) {
                $this->cleanTempFiles();
            }
            throw $e;
        }

        if (!$this->command->useFileAsOutput) {
            $text = $output["out"];
        } else {
            $text = file_get_contents($this->command->getOutputFile());

            // Hand the result file over to the caller-supplied location.
            if (null !== $this->outputFile) {
                rename($this->command->getOutputFile(), $this->outputFile);
            }

            $this->cleanTempFiles();
        }

        return trim($text, " \t\n\r\0\x0A\x0B\x0C");
    }

    /**
     * Supplies raw image data instead of an image path; run() writes it to the
     * process' stdin.
     *
     * @param mixed $image Image data.
     * @param mixed $size  Size passed along with the data to Process::write().
     * @return $this
     */
    public function imageData($image, $size)
    {
        FriendlyErrors::checkTesseractVersion("3.03-rc1", "Reading image data from stdin", $this->command);

        $cmd = $this->command;
        $cmd->useFileAsInput = false;
        $cmd->image = $image;
        $cmd->imageSize = $size;

        return $this;
    }

    /**
     * Switches off file-based output so run() uses the process output directly.
     *
     * @return $this
     */
    public function withoutTempFiles()
    {
        FriendlyErrors::checkTesseractVersion("3.03-rc1", "Writing to stdout (without using temp files)", $this->command);

        $this->command->useFileAsOutput = false;

        return $this;
    }

    /**
     * Sets the image stored on the command.
     *
     * @return $this
     */
    public function image($image)
    {
        $this->command->image = $image;

        return $this;
    }

    /**
     * Sets the executable to use, after checking that it is present.
     *
     * @return $this
     */
    public function executable($executable)
    {
        FriendlyErrors::checkTesseractPresence($executable);

        $this->command->executable = $executable;

        return $this;
    }

    /**
     * Sets the config file stored on the command.
     *
     * @return $this
     */
    public function configFile($configFile)
    {
        $this->command->configFile = $configFile;

        return $this;
    }

    /**
     * Sets the temp directory stored on the command.
     *
     * @return $this
     */
    public function tempDir($tempDir)
    {
        $this->command->tempDir = $tempDir;

        return $this;
    }

    /**
     * Sets the thread limit stored on the command.
     *
     * @return $this
     */
    public function threadLimit($limit)
    {
        $this->command->threadLimit = $limit;

        return $this;
    }

    // @deprecated
    public function format($fmt)
    {
        return $this->configFile($fmt);
    }

    /**
     * Remembers a destination path; run() moves the result file there.
     *
     * @return $this
     */
    public function setOutputFile($path)
    {
        $this->outputFile = $path;

        return $this;
    }

    /**
     * Adds a 'tessedit_char_whitelist' config option built from all arguments.
     * Array arguments are flattened by joining their elements; everything is
     * then concatenated into a single string.
     *
     * @return $this
     */
    public function allowlist()
    {
        $pieces = array();
        foreach (func_get_args() as $arg) {
            $pieces[] = is_array($arg) ? join('', $arg) : $arg;
        }

        $this->command->options[] = Option::config('tessedit_char_whitelist', join('', $pieces));

        return $this;
    }

    /**
     * Deprecated alias of allowlist(); raises an E_USER_NOTICE before delegating.
     *
     * @return $this
     */
    public function whitelist()
    {
        $warningMsg = 'Notice: whitelist is deprecated, use allowlist instead.';
        trigger_error($warningMsg, E_USER_NOTICE);

        $pieces = array();
        foreach (func_get_args() as $arg) {
            $pieces[] = is_array($arg) ? join('', $arg) : $arg;
        }

        return $this->allowlist(join('', $pieces));
    }

    /**
     * Returns whatever the command reports as the tesseract version.
     */
    public function version()
    {
        return $this->command->getTesseractVersion();
    }

    /**
     * Returns whatever the command reports as the available languages.
     */
    public function availableLanguages()
    {
        return $this->command->getAvailableLanguages();
    }

    /**
     * Catch-all for undeclared methods, resolved in this order:
     *  1. a known config file name      -> configFile($method)
     *  2. a method of the Option class  -> its result is appended to the options
     *  3. anything else                 -> Option::config($method, first argument or null)
     *
     * @return $this
     */
    public function __call($method, $args)
    {
        if ($this->isConfigFile($method)) {
            return $this->configFile($method);
        }

        if ($this->isOption($method)) {
            $factory = $this->getOptionClassName().'::'.$method;
            $this->command->options[] = call_user_func_array($factory, $args);
        } else {
            $value = empty($args) ? null : $args[0];
            $this->command->options[] = Option::config($method, $value);
        }

        return $this;
    }

    /**
     * Tells whether $name is one of the config file names handled by __call().
     */
    private function isConfigFile($name)
    {
        $known = array('digits', 'hocr', 'pdf', 'quiet', 'tsv', 'txt');

        return in_array($name, $known);
    }

    /**
     * Tells whether $name is a method of the Option class.
     */
    private function isOption($name)
    {
        $methods = get_class_methods($this->getOptionClassName());

        return in_array($name, $methods);
    }

    /**
     * Returns the name of the Option class, qualified with the current namespace.
     */
    private function getOptionClassName()
    {
        return __NAMESPACE__.'\\Option';
    }

    /**
     * Deletes the command's output file in both variants returned by
     * getOutputFile(false) and getOutputFile(true), when they exist.
     * NOTE(review): the flag presumably toggles the file extension - confirm in Command.
     */
    private function cleanTempFiles()
    {
        foreach (array(false, true) as $flag) {
            if (file_exists($this->command->getOutputFile($flag))) {
                unlink($this->command->getOutputFile($flag));
            }
        }
    }
}
|
|
@ -0,0 +1,7 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace thiagoalessio\TesseractOCR;
|
||||||
|
|
||||||
|
/**
 * Abstract base type for this library's exceptions.
 *
 * It adds nothing to \Exception; it exists so callers can catch every
 * library-specific failure through a single type.
 */
abstract class TesseractOcrException extends \Exception {}
|
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/UnsuccessfulCommandException.php
vendored
Normal file
7
tesseract/vendor/thiagoalessio/tesseract_ocr/src/UnsuccessfulCommandException.php
vendored
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace thiagoalessio\TesseractOCR;
|
||||||
|
|
||||||
|
/**
 * Concrete TesseractOcrException subtype with no members of its own.
 *
 * NOTE(review): presumably thrown when the tesseract command does not
 * succeed; the throw site is not part of this file - confirm there.
 */
class UnsuccessfulCommandException extends TesseractOcrException {}
|
Loading…
Reference in New Issue