New addon "tesseract" for OCR

This commit is contained in:
Michael 2024-01-14 19:21:08 +00:00 committed by Hypolite Petovan
parent a179bab747
commit 4dd903b473
28 changed files with 1904 additions and 0 deletions

View file

@ -0,0 +1,80 @@
<?php namespace thiagoalessio\TesseractOCR;
/**
 * Assembles the command line used to invoke the tesseract executable.
 *
 * An instance is string-castable: casting it (or calling build()) yields the
 * complete command, including input, output, options and config file.
 */
class Command
{
    /** @var string Name or path of the tesseract executable. */
    public $executable = 'tesseract';

    /** @var bool When true the escaped image path is passed; otherwise "-" is passed. */
    public $useFileAsInput = true;

    /** @var bool When true the escaped output base path is passed; otherwise "-" is passed. */
    public $useFileAsOutput = true;

    /** @var array Extra arguments; each entry is a string or a callable that receives the tesseract version. */
    public $options = array();

    /** @var string|null Config file name appended as-is (not escaped) at the end of the command. */
    public $configFile;

    /** @var string|null Directory for temporary output files; the system temp dir is used when empty. */
    public $tempDir;

    /** @var mixed When truthy, the command is prefixed with OMP_THREAD_LIMIT=<value>. */
    public $threadLimit;

    /** @var mixed Image path, or the image payload when input is not read from a file. */
    public $image;

    /** @var mixed Size that accompanies $image when it holds a payload rather than a path. */
    public $imageSize;

    /** @var string|null Output path without extension; lazily generated by getOutputFile(). */
    private $outputFile;

    /**
     * @param mixed       $image      Image path (or payload) to process.
     * @param string|null $outputFile Output path without extension; generated on demand when empty.
     */
    public function __construct($image=null, $outputFile=null)
    {
        $this->image = $image;
        $this->outputFile = $outputFile;
    }

    /**
     * Returns the command line as a string (same as casting the object).
     *
     * @return string
     */
    public function build()
    {
        return "$this";
    }

    /**
     * Builds the command line.
     *
     * Note that this runs the executable with --version (see
     * getTesseractVersion()) so that callable options can adapt to it.
     *
     * @return string
     */
    public function __toString()
    {
        $cmd = array();
        if ($this->threadLimit) $cmd[] = "OMP_THREAD_LIMIT={$this->threadLimit}";
        $cmd[] = self::escape($this->executable);
        $cmd[] = $this->useFileAsInput ? self::escape($this->image) : "-";
        $cmd[] = $this->useFileAsOutput ? self::escape($this->getOutputFile(false)) : "-";

        $version = $this->getTesseractVersion();
        foreach ($this->options as $option) {
            $cmd[] = is_callable($option) ? $option($version) : "$option";
        }
        if ($this->configFile) $cmd[] = $this->configFile;

        return join(' ', $cmd);
    }

    /**
     * Returns the output file path.
     *
     * When no output file was configured, a temporary file is created with
     * tempnam() on first use and its base name is combined with getTempDir().
     *
     * @param bool $withExt Append the extension implied by the config file
     *                      ("hocr", "tsv", "pdf", otherwise "txt").
     * @return string
     */
    public function getOutputFile($withExt=true)
    {
        if (!$this->outputFile)
            $this->outputFile = $this->getTempDir()
                .DIRECTORY_SEPARATOR
                .basename(tempnam($this->getTempDir(), 'ocr'));

        if (!$withExt) return $this->outputFile;

        $hasCustomExt = array('hocr', 'tsv', 'pdf');
        // Strict comparison: a non-string config value must never be mistaken for an extension.
        $ext = in_array($this->configFile, $hasCustomExt, true) ? $this->configFile : 'txt';
        return "{$this->outputFile}.{$ext}";
    }

    /**
     * Returns the configured temp directory, falling back to the system one.
     *
     * @return string
     */
    public function getTempDir()
    {
        return $this->tempDir ?: sys_get_temp_dir();
    }

    /**
     * Runs "<executable> --version" and returns the second space-separated
     * word of the first output line.
     *
     * @return string|null The version token, or null when the command printed
     *                     nothing or the first line has no second word.
     */
    public function getTesseractVersion()
    {
        $output = array();
        exec(self::escape($this->executable).' --version 2>&1', $output);

        // No output at all (e.g. the command could not be run): nothing to parse.
        if (!isset($output[0])) return null;

        $outputParts = explode(' ', $output[0]);
        return isset($outputParts[1]) ? $outputParts[1] : null;
    }

    /**
     * Runs "<executable> --list-langs", drops the first output line and
     * returns the remaining lines sorted.
     *
     * @return array
     */
    public function getAvailableLanguages()
    {
        $output = array();
        exec(self::escape($this->executable) . ' --list-langs 2>&1', $output);
        array_shift($output);
        sort($output);
        return $output;
    }

    /**
     * Wraps a value in double quotes, backslash-escaping `$`, `"` and the
     * backtick; the backslash itself is escaped too except on Windows.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        $charlist = strtoupper(substr(PHP_OS, 0, 3)) == 'WIN' ? '$"`' : '$"\\`';
        return '"'.addcslashes($str, $charlist).'"';
    }
}

View file

@ -0,0 +1,7 @@
<?php
namespace thiagoalessio\TesseractOCR;
/**
 * Thrown by FriendlyErrors::checkTesseractVersion() when the detected
 * tesseract version is lower than the version an action requires.
 */
class FeatureNotAvailableException extends TesseractOcrException
{
}

View file

@ -0,0 +1,120 @@
<?php namespace thiagoalessio\TesseractOCR;
/**
 * Collection of precondition/postcondition checks that turn low-level
 * failures into exceptions carrying a multi-line, human-readable message.
 *
 * Every check returns silently when the condition holds and throws otherwise.
 */
class FriendlyErrors
{
    /**
     * Ensures the image file exists.
     *
     * @param string $image Path of the image.
     * @throws ImageNotFoundException When the path does not exist.
     */
    public static function checkImagePath($image)
    {
        if (file_exists($image)) return;

        $currentDir = __DIR__;
        $msg = array();
        $msg[] = "Error! The image \"$image\" was not found.";
        $msg[] = '';
        $msg[] = "The current __DIR__ is $currentDir";
        $msg = join(PHP_EOL, $msg);
        throw new ImageNotFoundException($msg);
    }

    /**
     * Ensures the executable is either an existing file or resolvable by the
     * platform lookup command ("where.exe" on Windows, "type" elsewhere).
     *
     * @param string $executable Name or path of the tesseract executable.
     * @throws TesseractNotFoundException When the executable cannot be found.
     */
    public static function checkTesseractPresence($executable)
    {
        if (file_exists($executable)) return;

        $cmd = stripos(PHP_OS, 'win') === 0
            ? 'where.exe '.Command::escape($executable).' > NUL 2>&1'
            : 'type '.Command::escape($executable).' > /dev/null 2>&1';
        system($cmd, $exitCode);
        if ($exitCode === 0) return;

        $currentPath = getenv('PATH');
        $msg = array();
        $msg[] = "Error! The command \"$executable\" was not found.";
        $msg[] = '';
        $msg[] = 'Make sure you have Tesseract OCR installed on your system:';
        $msg[] = 'https://github.com/tesseract-ocr/tesseract';
        $msg[] = '';
        $msg[] = "The current \$PATH is $currentPath";
        $msg = join(PHP_EOL, $msg);
        throw new TesseractNotFoundException($msg);
    }

    /**
     * Ensures the command produced output: a non-empty output file when the
     * command writes to a file, or a non-empty stdout otherwise.
     *
     * @param Command $command Command that was executed.
     * @param string  $stdout  Captured standard output.
     * @param string  $stderr  Captured standard error, appended to the message.
     * @throws UnsuccessfulCommandException When nothing was produced.
     */
    public static function checkCommandExecution($command, $stdout, $stderr)
    {
        if ($command->useFileAsOutput) {
            $file = $command->getOutputFile();
            if (file_exists($file) && filesize($file) > 0) return;
        }

        if (!$command->useFileAsOutput && $stdout) {
            return;
        }

        $msg = array();
        $msg[] = 'Error! The command did not produce any output.';
        $msg[] = '';
        $msg[] = 'Generated command:';
        $msg[] = "$command";
        $msg[] = '';
        $msg[] = 'Returned message:';
        $arrayStderr = explode(PHP_EOL, $stderr);
        // Drop the last element (the empty piece after a trailing newline).
        array_pop($arrayStderr);
        $msg = array_merge($msg, $arrayStderr);
        $msg = join(PHP_EOL, $msg);
        throw new UnsuccessfulCommandException($msg);
    }

    /**
     * Ensures the process handle is not FALSE.
     *
     * @param mixed  $processHandle Value returned by proc_open().
     * @param string $command       Command that was launched.
     * @throws UnsuccessfulCommandException When the handle is FALSE.
     */
    public static function checkProcessCreation($processHandle, $command)
    {
        if ($processHandle !== FALSE) return;

        $msg = array();
        $msg[] = 'Error! The command could not be launched.';
        $msg[] = '';
        $msg[] = 'Generated command:';
        $msg[] = "$command";
        $msg = join(PHP_EOL, $msg);
        throw new UnsuccessfulCommandException($msg);
    }

    /**
     * Ensures the installed tesseract is at least the expected version.
     *
     * @param string  $expected Minimum required version.
     * @param string  $action   Description of the feature, used in the message.
     * @param Command $command  Command whose executable is queried.
     * @throws FeatureNotAvailableException When the version is too old or
     *                                      could not be determined.
     */
    public static function checkTesseractVersion($expected, $action, $command)
    {
        // The version may be null/empty when it could not be parsed; normalise
        // to a string first so stripping the optional "v" prefix never indexes
        // into a non-string or empty value.
        $actual = (string) $command->getTesseractVersion();
        if (strncmp($actual, 'v', 1) === 0)
            $actual = substr($actual, 1);

        if (version_compare($actual, $expected, ">=")) return;

        $msg = array();
        $msg[] = "Error! $action is not available this tesseract version";
        $msg[] = "Required version is $expected, actual version is $actual";
        $msg[] = '';
        $msg[] = 'Generated command:';
        $msg[] = "$command";
        $msg = join(PHP_EOL, $msg);
        throw new FeatureNotAvailableException($msg);
    }

    /**
     * Ensures the output path can be written, creating its parent directory
     * (including missing intermediate directories) when necessary.
     *
     * @param string $path Destination file path.
     * @throws NoWritePermissionsForOutputFile When the directory or an
     *                                         existing file is not writable.
     */
    public static function checkWritePermissions($path)
    {
        $directory = dirname($path);
        // Recursive so that nested, not-yet-existing output folders work; a
        // failed creation is reported below through the writability check.
        if (!is_dir($directory)) mkdir($directory, 0777, true);

        $writableDirectory = is_writable($directory);
        $writableFile = true;
        if (file_exists($path)) $writableFile = is_writable($path);
        if ($writableFile && $writableDirectory) return;

        $msg = array();
        $msg[] = "Error! No permission to write to $path";
        $msg[] = "Make sure you have the right outputFile and permissions "
            ."to write to the folder";
        $msg[] = '';
        $msg = join(PHP_EOL, $msg);
        throw new NoWritePermissionsForOutputFile($msg);
    }
}

View file

@ -0,0 +1,7 @@
<?php
namespace thiagoalessio\TesseractOCR;
/**
 * Thrown by FriendlyErrors::checkImagePath() when the given image path does
 * not exist.
 */
class ImageNotFoundException extends TesseractOcrException
{
}

View file

@ -0,0 +1,7 @@
<?php
namespace thiagoalessio\TesseractOCR;
/**
 * Thrown by FriendlyErrors::checkWritePermissions() when the output file or
 * its directory is not writable.
 */
class NoWritePermissionsForOutputFile extends TesseractOcrException
{
}

View file

@ -0,0 +1,79 @@
<?php namespace thiagoalessio\TesseractOCR;
/**
 * Factories for tesseract command-line options.
 *
 * Every factory returns a closure that renders the option as a string. The
 * closure is invoked with the tesseract version so the rendering can adapt to
 * (or reject) the installed version.
 */
class Option
{
    /**
     * Page segmentation mode: "--psm" from version 4 on, "-psm" before.
     *
     * @param mixed $psm
     * @return \Closure
     */
    public static function psm($psm)
    {
        return function($version) use ($psm) {
            $version = preg_replace('/^v/', '', (string) $version);
            return (version_compare($version, '4', '>=') ? '-' : '')."-psm $psm";
        };
    }

    /**
     * OCR engine mode; requires tesseract 3.05 or later.
     *
     * @param mixed $oem
     * @return \Closure
     */
    public static function oem($oem)
    {
        return function($version) use ($oem) {
            Option::checkMinVersion('3.05', $version, 'oem');
            return "--oem $oem";
        };
    }

    /**
     * Image resolution hint.
     *
     * @param mixed $dpi
     * @return \Closure
     */
    public static function dpi($dpi)
    {
        return function() use ($dpi) {
            return "--dpi $dpi";
        };
    }

    /**
     * User words file; requires tesseract 3.04 or later.
     *
     * @param string $path
     * @return \Closure
     */
    public static function userWords($path)
    {
        return function($version) use ($path) {
            Option::checkMinVersion('3.04', $version, 'user-words');
            return '--user-words "'.addcslashes($path, '\\"').'"';
        };
    }

    /**
     * User patterns file; requires tesseract 3.04 or later.
     *
     * @param string $path
     * @return \Closure
     */
    public static function userPatterns($path)
    {
        return function($version) use ($path) {
            Option::checkMinVersion('3.04', $version, 'user-patterns');
            return '--user-patterns "'.addcslashes($path, '\\"').'"';
        };
    }

    /**
     * Location of the tessdata directory.
     *
     * @param string $path
     * @return \Closure
     */
    public static function tessdataDir($path)
    {
        return function() use ($path) {
            return '--tessdata-dir "'.addcslashes($path, '\\"').'"';
        };
    }

    /**
     * Languages to use; accepts any number of arguments, joined with "+".
     *
     * @return \Closure
     */
    public static function lang()
    {
        $languages = func_get_args();
        return function() use ($languages) {
            return '-l '.join('+', $languages);
        };
    }

    /**
     * Config variable assignment ("-c name=value"). The variable name is
     * converted from camelCase to snake_case.
     *
     * @param string $var
     * @param mixed  $value
     * @return \Closure
     */
    public static function config($var, $value)
    {
        return function() use($var, $value) {
            $snakeCase = function($str) {
                // Capture the whole run of capitals: with the quantifier
                // outside the group only the last letter of a run survived
                // (e.g. "loadSystemDAWG" became "load_system_g").
                return strtolower(preg_replace('/([A-Z]+)/', '_$1', $str));
            };
            $pair = $snakeCase($var).'='.$value;
            return '-c "'.addcslashes($pair, '\\"').'"';
        };
    }

    /**
     * Throws when the current version is lower than the minimum one. A
     * leading "v" is ignored on both versions.
     *
     * @param string      $minVersion
     * @param string|null $currVersion
     * @param string      $option Option name used in the error message.
     * @throws \Exception When $currVersion is lower than $minVersion.
     */
    public static function checkMinVersion($minVersion, $currVersion, $option)
    {
        // Cast first: the detected version may be null when it could not be parsed.
        $minVersion = preg_replace('/^v/', '', (string) $minVersion);
        $currVersion = preg_replace('/^v/', '', (string) $currVersion);
        if (!version_compare($currVersion, $minVersion, '<')) return;

        $msg = "$option option is only available on Tesseract $minVersion or later.";
        $msg.= PHP_EOL."Your version of Tesseract is $currVersion";
        throw new \Exception($msg);
    }
}

View file

@ -0,0 +1,83 @@
<?php namespace thiagoalessio\TesseractOCR;
/**
 * Thin wrapper around proc_open() that exposes the child's stdin for writing
 * and collects its stdout/stderr.
 */
class Process
{
    /** @var resource|null Write end of the child's stdin. */
    private $stdin;

    /** @var resource|null Read end of the child's stdout (non-blocking). */
    private $stdout;

    /** @var resource|null Read end of the child's stderr (non-blocking). */
    private $stderr;

    /** @var resource|false Handle returned by proc_open(). */
    private $handle;

    /** @var float Timestamp taken at construction; the timeout is measured from here. */
    private $startTime;

    /**
     * Launches the command.
     *
     * @param string $command Command line to run.
     * @throws UnsuccessfulCommandException When the process cannot be created.
     */
    public function __construct($command)
    {
        $this->startTime = microtime(true);
        $streamDescriptors = [
            array("pipe", "r"),
            array("pipe", "w"),
            array("pipe", "w")
        ];
        $pipes = [];
        $this->handle = proc_open($command, $streamDescriptors, $pipes, NULL, NULL, ["bypass_shell" => true]);

        // Validate the handle before touching $pipes: on failure the pipes are
        // not populated and destructuring them would only add noise.
        FriendlyErrors::checkProcessCreation($this->handle, $command);
        list($this->stdin, $this->stdout, $this->stderr) = $pipes;

        // Non-blocking reads avoid a deadlock in some cases (when the stderr
        // buffer fills up before anything is written to stdout, and vice versa).
        stream_set_blocking($this->stdout, false);
        stream_set_blocking($this->stderr, false);
    }

    /**
     * Writes data to the child's stdin, retrying after partial writes.
     *
     * @param string $data Bytes to send.
     * @param int    $len  Number of bytes expected to be written.
     * @return bool True when exactly $len bytes were written.
     */
    public function write($data, $len)
    {
        $total = 0;
        while ($total < $len) {
            $res = fwrite($this->stdin, substr($data, $total));
            // false (error) or 0 (nothing accepted / nothing left): stop
            // instead of spinning forever.
            if (!$res) break;
            $total += $res;
        }
        return $total === $len;
    }

    /**
     * Collects stdout and stderr until the process exits or the timeout
     * (measured from construction) elapses.
     *
     * @param int|float $timeout Seconds to wait; 0 or less means no timeout.
     * @return array Array with the keys "out" and "err".
     */
    public function wait($timeout = 0)
    {
        $running = true;
        $data = ["out" => "", "err" => ""];
        while (($running === true) && !$this->hasTimedOut($timeout))
        {
            $data["out"] .= fread($this->stdout, 8192);
            $data["err"] .= fread($this->stderr, 8192);
            $procInfo = proc_get_status($this->handle);
            $running = $procInfo["running"];
            if ($running) {
                usleep(1000); // Sleep 1ms to yield CPU time
            }
        }

        // The loop reads at most one chunk per pipe per pass and stops as soon
        // as the process is gone, so output may still be buffered in the
        // pipes. Drain whatever is available; the streams are non-blocking, so
        // this cannot hang.
        $data["out"] .= stream_get_contents($this->stdout);
        $data["err"] .= stream_get_contents($this->stderr);

        return $data;
    }

    /**
     * Closes all pipes that are still open and then the process itself.
     *
     * @return int Value returned by proc_close().
     */
    public function close()
    {
        $this->closeStream($this->stdin);
        $this->closeStream($this->stdout);
        $this->closeStream($this->stderr);
        return proc_close($this->handle);
    }

    /**
     * Closes the child's stdin, signalling the end of the input.
     */
    public function closeStdin()
    {
        $this->closeStream($this->stdin);
    }

    /**
     * @param int|float $timeout Seconds; values of 0 or less disable the timeout.
     * @return bool True when more than $timeout seconds passed since construction.
     */
    private function hasTimedOut($timeout)
    {
        return (($timeout > 0) && ($this->startTime + $timeout < microtime(true)));
    }

    /**
     * Closes a stream and resets the referenced variable to NULL so that it
     * is never closed twice.
     *
     * @param resource|null $stream
     */
    private function closeStream(&$stream)
    {
        if ($stream !== NULL)
        {
            fclose($stream);
            $stream = NULL;
        }
    }
}

View file

@ -0,0 +1,7 @@
<?php
namespace thiagoalessio\TesseractOCR;
/**
 * Thrown by FriendlyErrors::checkTesseractPresence() when the tesseract
 * executable cannot be located.
 */
class TesseractNotFoundException extends TesseractOcrException
{
}

View file

@ -0,0 +1,181 @@
<?php namespace thiagoalessio\TesseractOCR;
use thiagoalessio\TesseractOCR\Command;
use thiagoalessio\TesseractOCR\Option;
use thiagoalessio\TesseractOCR\FriendlyErrors;
/**
 * Fluent front end for running tesseract.
 *
 * Setters configure the underlying Command and return $this so calls can be
 * chained; run() executes the command and returns the recognised text.
 * Unknown method names are routed through __call().
 */
class TesseractOCR
{
    /** @var Command Command being configured and executed. */
    public $command;

    /** @var string|null Destination set through setOutputFile(), if any. */
    private $outputFile = null;

    /**
     * @param mixed        $image   Image path; cast to string before use.
     * @param Command|null $command Command to configure; a new one is created when empty.
     */
    public function __construct($image=null, $command=null)
    {
        if ($command) {
            $this->command = $command;
        } else {
            $this->command = new Command;
        }
        $this->image("$image");
    }

    /**
     * Runs tesseract and returns the trimmed text it produced.
     *
     * @param int|float $timeout Passed to Process::wait().
     * @return string
     * @throws TesseractOcrException Re-thrown after temp files were cleaned up.
     */
    public function run($timeout = 0)
    {
        $cmd = $this->command;

        try {
            if ($this->outputFile !== null) {
                FriendlyErrors::checkWritePermissions($this->outputFile);
                // A destination file implies file output.
                $cmd->useFileAsOutput = true;
            }

            FriendlyErrors::checkTesseractPresence($cmd->executable);
            if ($cmd->useFileAsInput) {
                FriendlyErrors::checkImagePath($cmd->image);
            }

            $process = new Process("{$cmd}");

            if (!$cmd->useFileAsInput) {
                // Image payload goes through stdin.
                $process->write($cmd->image, $cmd->imageSize);
                $process->closeStdin();
            }

            $output = $process->wait($timeout);
            FriendlyErrors::checkCommandExecution($cmd, $output["out"], $output["err"]);
        }
        catch (TesseractOcrException $failure) {
            if ($cmd->useFileAsOutput) {
                $this->cleanTempFiles();
            }
            throw $failure;
        }

        $trimmable = " \t\n\r\0\x0A\x0B\x0C";

        if (!$cmd->useFileAsOutput) {
            return trim($output["out"], $trimmable);
        }

        $resultFile = $cmd->getOutputFile();
        $text = file_get_contents($resultFile);
        if ($this->outputFile !== null) {
            rename($resultFile, $this->outputFile);
        }
        $this->cleanTempFiles();

        return trim($text, $trimmable);
    }

    /**
     * Supplies the image as a payload sent through stdin instead of a path.
     *
     * @param mixed $image Image payload.
     * @param mixed $size  Size handed to Process::write() together with the payload.
     * @return $this
     * @throws FeatureNotAvailableException When tesseract is older than 3.03-rc1.
     */
    public function imageData($image, $size)
    {
        FriendlyErrors::checkTesseractVersion(
            "3.03-rc1",
            "Reading image data from stdin",
            $this->command
        );

        $target = $this->command;
        $target->useFileAsInput = false;
        $target->image = $image;
        $target->imageSize = $size;

        return $this;
    }

    /**
     * Makes tesseract write to stdout rather than to an output file.
     *
     * @return $this
     * @throws FeatureNotAvailableException When tesseract is older than 3.03-rc1.
     */
    public function withoutTempFiles()
    {
        FriendlyErrors::checkTesseractVersion(
            "3.03-rc1",
            "Writing to stdout (without using temp files)",
            $this->command
        );
        $this->command->useFileAsOutput = false;

        return $this;
    }

    /**
     * Sets the image on the command.
     *
     * @param mixed $image
     * @return $this
     */
    public function image($image)
    {
        $this->command->image = $image;

        return $this;
    }

    /**
     * Sets the tesseract executable after verifying that it can be found.
     *
     * @param string $executable
     * @return $this
     * @throws TesseractNotFoundException
     */
    public function executable($executable)
    {
        FriendlyErrors::checkTesseractPresence($executable);
        $this->command->executable = $executable;

        return $this;
    }

    /**
     * Sets the config file on the command.
     *
     * @param string $configFile
     * @return $this
     */
    public function configFile($configFile)
    {
        $this->command->configFile = $configFile;

        return $this;
    }

    /**
     * Sets the temp directory on the command.
     *
     * @param string $tempDir
     * @return $this
     */
    public function tempDir($tempDir)
    {
        $this->command->tempDir = $tempDir;

        return $this;
    }

    /**
     * Sets the thread limit on the command.
     *
     * @param mixed $limit
     * @return $this
     */
    public function threadLimit($limit)
    {
        $this->command->threadLimit = $limit;

        return $this;
    }

    /**
     * Alias of configFile().
     *
     * @deprecated
     * @param string $fmt
     * @return $this
     */
    public function format($fmt)
    {
        return $this->configFile($fmt);
    }

    /**
     * Sets the file the result is moved to after a successful run.
     *
     * @param string $path
     * @return $this
     */
    public function setOutputFile($path)
    {
        $this->outputFile = $path;

        return $this;
    }

    /**
     * Restricts recognition to the given characters. Accepts any number of
     * strings and/or arrays, which are concatenated in order.
     *
     * @return $this
     */
    public function allowlist()
    {
        $characters = '';
        foreach (func_get_args() as $piece) {
            $characters .= is_array($piece) ? join('', $piece) : $piece;
        }
        $this->command->options[] = Option::config('tessedit_char_whitelist', $characters);

        return $this;
    }

    /**
     * Deprecated alias of allowlist(); raises an E_USER_NOTICE.
     *
     * @return $this
     */
    public function whitelist()
    {
        trigger_error('Notice: whitelist is deprecated, use allowlist instead.', E_USER_NOTICE);

        $characters = '';
        foreach (func_get_args() as $piece) {
            $characters .= is_array($piece) ? join('', $piece) : $piece;
        }

        return $this->allowlist($characters);
    }

    /**
     * Returns the version reported by the command.
     *
     * @return mixed
     */
    public function version()
    {
        return $this->command->getTesseractVersion();
    }

    /**
     * Returns the languages reported by the command.
     *
     * @return array
     */
    public function availableLanguages()
    {
        return $this->command->getAvailableLanguages();
    }

    /**
     * Routes unknown method calls, in this order: known config file names set
     * the config file; names of Option methods append that option; anything
     * else is appended as a config variable using the first argument (or null).
     *
     * @param string $method
     * @param array  $args
     * @return $this
     */
    public function __call($method, $args)
    {
        if ($this->isConfigFile($method)) {
            return $this->configFile($method);
        }

        if ($this->isOption($method)) {
            $factory = $this->getOptionClassName().'::'.$method;
            $this->command->options[] = call_user_func_array($factory, $args);
        } else {
            $value = empty($args) ? null : $args[0];
            $this->command->options[] = Option::config($method, $value);
        }

        return $this;
    }

    /**
     * @param string $name
     * @return bool Whether the name is one of the recognised config files.
     */
    private function isConfigFile($name)
    {
        $known = array('digits', 'hocr', 'pdf', 'quiet', 'tsv', 'txt');

        return in_array($name, $known);
    }

    /**
     * @param string $name
     * @return bool Whether the Option class has a method with this name.
     */
    private function isOption($name)
    {
        $factories = get_class_methods($this->getOptionClassName());

        return in_array($name, $factories);
    }

    /**
     * @return string Name of the Option class in the current namespace.
     */
    private function getOptionClassName()
    {
        return __NAMESPACE__.'\\Option';
    }

    /**
     * Deletes the output file of the command, both without and with its
     * extension, when present.
     */
    private function cleanTempFiles()
    {
        foreach (array(false, true) as $withExt) {
            if (file_exists($this->command->getOutputFile($withExt))) {
                unlink($this->command->getOutputFile($withExt));
            }
        }
    }
}

View file

@ -0,0 +1,7 @@
<?php
namespace thiagoalessio\TesseractOCR;
/**
 * Abstract base class of the exceptions raised by this library; it is the
 * type TesseractOCR::run() catches in order to clean up temporary files
 * before re-throwing.
 */
abstract class TesseractOcrException extends \Exception
{
}

View file

@ -0,0 +1,7 @@
<?php
namespace thiagoalessio\TesseractOCR;
/**
 * Thrown by FriendlyErrors::checkCommandExecution() when the command produced
 * no output, and by FriendlyErrors::checkProcessCreation() when the process
 * could not be launched.
 */
class UnsuccessfulCommandException extends TesseractOcrException
{
}