From 37285a20190ad5e871c5e141d574a4eb80c466a8 Mon Sep 17 00:00:00 2001 From: loma-one Date: Sat, 3 May 2025 13:26:23 +0200 Subject: [PATCH] tesseract/tesseract.php aktualisiert Modified addon. The addon used to generate a large system load and led to the server being unavailable. The changes help to make better use of system resources. - Creates and removes tesseract-limited.sh to limit the system resources for tesseract (timeout/resource limits). - Checks permitted formats to avoid wasting resources. --- tesseract/tesseract.php | 91 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 6 deletions(-) diff --git a/tesseract/tesseract.php b/tesseract/tesseract.php index b3e1feb6..62ff4d18 100644 --- a/tesseract/tesseract.php +++ b/tesseract/tesseract.php @@ -1,9 +1,10 @@ + * Modified by: Matthias Ebers */ use Friendica\Core\Hook; @@ -13,25 +14,103 @@ use thiagoalessio\TesseractOCR\TesseractOCR; require_once __DIR__ . DIRECTORY_SEPARATOR . 'vendor' . DIRECTORY_SEPARATOR . 'autoload.php'; +/** + * Runs when the add-on is activated: registers the 'ocr-detection' hook and creates the tesseract-limited.sh wrapper script if it does not exist yet + */ function tesseract_install() { Hook::register('ocr-detection', __FILE__, 'tesseract_ocr_detection'); - Logger::notice('installed tesseract'); + $wrapperPath = __DIR__ . '/tesseract-limited.sh'; + + // Create wrapper script with timeout and CPU/I/O priority + if (!file_exists($wrapperPath)) { + $script = << $wrapperPath]); + } else { + Logger::info('Tesseract wrapper script already exists', ['path' => $wrapperPath]); + } + + Logger::notice('Tesseract OCR addon installed'); } +/** + * Runs when the add-on is deactivated: deletes the wrapper script if present and unregisters the 'ocr-detection' hook + */ +function tesseract_uninstall() +{ + $wrapperPath = __DIR__ . 
'/tesseract-limited.sh'; + + if (file_exists($wrapperPath)) { + unlink($wrapperPath); + Logger::notice('Tesseract wrapper script removed', ['path' => $wrapperPath]); + } + + Hook::unregister('ocr-detection', __FILE__, 'tesseract_ocr_detection'); + Logger::notice('Tesseract OCR addon uninstalled'); +} + +/** + * Handler for the 'ocr-detection' hook: runs OCR on $media['img_str'] and stores any non-empty recognised text in $media['description'] + */ function tesseract_ocr_detection(&$media) { + // Skip OCR when the image already has a description (alt text) + if (!empty($media['description'])) { + Logger::debug('Image already has description, skipping OCR'); + return; + } + + // Skip OCR when a MIME type is given but is not in the allow-list + $allowedTypes = ['image/jpeg', 'image/png', 'image/bmp', 'image/tiff']; + if (!empty($media['type']) && !in_array($media['type'], $allowedTypes)) { + Logger::debug('Unsupported image type for OCR', ['type' => $media['type']]); + return; + } + + // No MIME type: fall back to the filename and skip only files ending in .gif + if (empty($media['type']) && !empty($media['filename']) && preg_match('/\.gif$/i', $media['filename'])) { + Logger::debug('GIF image detected via filename, skipping OCR'); + return; + } + $ocr = new TesseractOCR(); + try { + // Use the tesseract-limited.sh wrapper script as the tesseract executable + $wrapperPath = __DIR__ . '/tesseract-limited.sh'; + $ocr->executable($wrapperPath); + + // Load all available languages $languages = $ocr->availableLanguages(); if ($languages) { $ocr->lang(implode('+', $languages)); } + + // Use the system temp path as the temporary directory $ocr->tempDir(System::getTempPath()); + + // Pass the raw image bytes together with their length $ocr->imageData($media['img_str'], strlen($media['img_str'])); - $media['description'] = $ocr->run(); + + // Run OCR and trim surrounding whitespace from the result + $text = trim($ocr->run()); + + if (!empty($text)) { + $media['description'] = $text; + Logger::debug('OCR text detected', ['text' => $text]); + } else { + Logger::debug('No text detected in image'); + } } catch (\Throwable $th) { Logger::info('Error calling TesseractOCR', ['message' => $th->getMessage()]); - } -} + } +} \ No newline at end of file