From 16224a7001abd7d5e826227731a92c1ef8ce255f Mon Sep 17 00:00:00 2001
From: Michael
Date: Mon, 5 Oct 2020 12:50:18 +0000
Subject: [PATCH] Improve plaintext generation for language detection

---
 src/Content/Text/BBCode.php | 36 ++++++++++++++++++++++++++++++++++++
 src/Model/Item.php          | 12 +++++++++++-
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/src/Content/Text/BBCode.php b/src/Content/Text/BBCode.php
index 5b22746ce6..1b0fa9c740 100644
--- a/src/Content/Text/BBCode.php
+++ b/src/Content/Text/BBCode.php
@@ -1220,6 +1220,42 @@ class BBCode
 		return $return;
 	}
 
+	/**
+	 * Remove links and pictures from a BBCode text
+	 *
+	 * Strips [img] elements and [url] elements without a description and
+	 * replaces described links with their description, so that the remaining
+	 * text can be used e.g. for the language detection.
+	 *
+	 * @param string $bbcode BBCode text
+	 * @return string BBCode text without pictures and links
+	 */
+	public static function removeLinks(string $bbcode): string
+	{
+		$replacements = [
+			// Pictures of the form "[img=<digits>x<digits>]...[/img]" (presumably width x height)
+			'/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism' => ' ',
+			// Other "[img=...]...[/img]" elements are replaced by their attribute value
+			'/\[img\=(.*?)\](.*?)\[\/img\]/ism' => ' $1 ',
+			// Plain pictures "[img]...[/img]"
+			'/\[img\](.*?)\[\/img\]/ism' => ' ',
+			// Links prefixed with "@", "!" or "#" (presumably mentions and hashtags) are dropped with their text
+			'/([@!#])\[url\=(.*?)\](.*?)\[\/url\]/ism' => '',
+			// Bare links "[url]...[/url]"; no such pair can reappear after this step
+			'/\[url\](.*?)\[\/url\]/ism' => ' ',
+			// Described links "[url=...]text[/url]" are replaced by their text
+			'/\[url=[^\[\]]*\](.*)\[\/url\]/Usi' => ' $1 ',
+		];
+
+		foreach ($replacements as $pattern => $replacement) {
+			// preg_replace() returns null on a PCRE error (e.g. backtrack limit),
+			// in that case the text of the previous step is kept
+			$bbcode = preg_replace($pattern, $replacement, $bbcode) ?? $bbcode;
+		}
+
+		return $bbcode;
+	}
+
 	/**
 	 * Converts a BBCode message to HTML message
 	 *
diff --git a/src/Model/Item.php b/src/Model/Item.php
index dfea296815..d53933ba78 100644
--- a/src/Model/Item.php
+++ b/src/Model/Item.php
@@ -2476,7 +2476,17 @@ class Item
 			return '';
 		}
 
-		$naked_body = BBCode::toPlaintext($item['body'], false);
+		// Convert attachments to links
+		$naked_body = BBCode::removeAttachment($item['body']);
+
+		// Remove links and pictures
+		$naked_body = BBCode::removeLinks($naked_body);
+
+		// Convert the title and the body to plain text
+		$naked_body = trim($item['title'] . "\n" . BBCode::toPlaintext($naked_body));
+
+		// Remove possibly remaining links
+		$naked_body = preg_replace(Strings::autoLinkRegEx(), '', $naked_body);
 
 		$ld = new Language();
 		$languages = $ld->detect($naked_body)->limit(0, 3)->close();