mirror of
https://git.friendi.ca/friendica/friendica-addons.git
synced 2025-07-15 12:58:49 +00:00
[retriever] First submission of retriever plugin
This commit is contained in:
parent
f573907bd2
commit
d23c7c1147
8 changed files with 1457 additions and 0 deletions
42
retriever/database.sql
Normal file
42
retriever/database.sql
Normal file
|
@ -0,0 +1,42 @@
|
|||
CREATE TABLE IF NOT EXISTS `retriever_rule` (
|
||||
`id` int(11) unsigned NOT NULL AUTO_INCREMENT,
|
||||
`uid` int(11) NOT NULL,
|
||||
`contact-id` int(11) NOT NULL,
|
||||
`data` mediumtext NULL DEFAULT NULL,
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `uid` (`uid`),
|
||||
KEY `contact-id` (`contact-id`)
|
||||
) DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS `retriever_item` (
|
||||
`id` int(11) unsigned NOT NULL AUTO_INCREMENT,
|
||||
`item-uri` varchar(800) CHARACTER SET ascii COLLATE ascii_bin NOT NULL,
|
||||
`item-uid` int(10) unsigned NOT NULL DEFAULT '0',
|
||||
`contact-id` int(10) unsigned NOT NULL DEFAULT '0',
|
||||
`resource` int(11) NOT NULL,
|
||||
`finished` tinyint(1) unsigned NOT NULL DEFAULT '0',
|
||||
KEY `resource` (`resource`),
|
||||
KEY `finished` (`finished`),
|
||||
KEY `item-uid` (`item-uid`),
|
||||
KEY `all` (`item-uri`, `item-uid`, `contact-id`),
|
||||
PRIMARY KEY (`id`)
|
||||
) DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS `retriever_resource` (
|
||||
`id` int(11) unsigned NOT NULL AUTO_INCREMENT,
|
||||
`item-uid` int(10) unsigned NOT NULL DEFAULT '0',
|
||||
`contact-id` int(10) unsigned NOT NULL DEFAULT '0',
|
||||
`type` char(255) NULL DEFAULT NULL,
|
||||
`binary` int(1) NOT NULL DEFAULT 0,
|
||||
`url` varchar(800) CHARACTER SET ascii COLLATE ascii_bin NOT NULL,
|
||||
`created` timestamp NOT NULL DEFAULT now(),
|
||||
`completed` timestamp NULL DEFAULT NULL,
|
||||
`last-try` timestamp NULL DEFAULT NULL,
|
||||
`num-tries` int(11) NOT NULL DEFAULT 0,
|
||||
`data` mediumblob NULL DEFAULT NULL,
|
||||
`http-code` smallint(1) unsigned NULL DEFAULT NULL,
|
||||
`redirect-url` varchar(800) CHARACTER SET ascii COLLATE ascii_bin NULL DEFAULT NULL,
|
||||
KEY `url` (`url`),
|
||||
KEY `completed` (`completed`),
|
||||
PRIMARY KEY (`id`)
|
||||
) DEFAULT CHARSET=utf8 COLLATE=utf8_bin
|
1028
retriever/retriever.php
Normal file
1028
retriever/retriever.php
Normal file
File diff suppressed because it is too large
Load diff
4
retriever/templates/admin.tpl
Normal file
4
retriever/templates/admin.tpl
Normal file
|
@ -0,0 +1,4 @@
|
|||
{{include file="field_input.tpl" field=$downloads_per_cron}}
|
||||
{{include file="field_checkbox.tpl" field=$allow_images}}
|
||||
<div class="submit"><input type="submit" name="page_site" value="{{$submit}}"></div>
|
||||
|
24
retriever/templates/extract.tpl
Normal file
24
retriever/templates/extract.tpl
Normal file
|
@ -0,0 +1,24 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
|
||||
<xsl:output method="html" indent="yes" version="4.0"/>
|
||||
|
||||
<xsl:template match="text()"/>
|
||||
{{function clause_xpath}}{{if !$clause.attribute}}{{$clause.element}}{{elseif $clause.attribute == 'class'}}{{$clause.element}}[contains(concat(' ', normalize-space(@class), ' '), '{{$clause.value}}')]{{else}}{{$clause.element}}[@{{$clause.attribute}}='{{$clause.value}}']{{/if}}{{/function}}
|
||||
{{foreach $spec.include as $clause}}
|
||||
|
||||
<xsl:template match="{{clause_xpath clause=$clause}}">
|
||||
<xsl:copy>
|
||||
<xsl:apply-templates select="node()|@*" mode="remove"/>
|
||||
</xsl:copy>
|
||||
</xsl:template>{{/foreach}}
|
||||
{{foreach $spec.exclude as $clause}}
|
||||
|
||||
<xsl:template match="{{clause_xpath clause=$clause}}" mode="remove"/>{{/foreach}}
|
||||
|
||||
<xsl:template match="node()|@*" mode="remove">
|
||||
<xsl:copy>
|
||||
<xsl:apply-templates select="node()|@*" mode="remove"/>
|
||||
</xsl:copy>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
26
retriever/templates/fix-urls.tpl
Normal file
26
retriever/templates/fix-urls.tpl
Normal file
|
@ -0,0 +1,26 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
|
||||
<!-- attempt to replace relative URLs with absolute URLs -->
|
||||
<!-- http://stackoverflow.com/questions/3824631/replace-href-value-in-anchor-tags-of-html-using-xslt -->
|
||||
|
||||
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
|
||||
<xsl:output method="html" indent="yes" version="4.0"/>
|
||||
|
||||
<xsl:template match="node()|@*">
|
||||
<xsl:copy>
|
||||
<xsl:apply-templates select="node()|@*"/>
|
||||
</xsl:copy>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="*/@src[starts-with(.,'.')]">
|
||||
<xsl:attribute name="src">
|
||||
<xsl:value-of select="concat('{{$dirurl}}',.)"/>
|
||||
</xsl:attribute>
|
||||
</xsl:template>
|
||||
<xsl:template match="*/@src[starts-with(.,'/')]">
|
||||
<xsl:attribute name="src">
|
||||
<xsl:value-of select="concat('{{$rooturl}}',.)"/>
|
||||
</xsl:attribute>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
163
retriever/templates/help.tpl
Normal file
163
retriever/templates/help.tpl
Normal file
|
@ -0,0 +1,163 @@
|
|||
<h2>Retriever Plugin Help</h2>
|
||||
<p>
|
||||
This plugin replaces the short excerpts you normally get in RSS feeds
|
||||
with the full content of the article from the source website. You
|
||||
specify which part of the page you're interested in with a set of
|
||||
rules. When each item arrives, the plugin downloads the full page
|
||||
from the website, extracts content using the rules, and replaces the
|
||||
original article.
|
||||
</p>
|
||||
<p>
|
||||
There's a few reasons you may want to do this. The source website
|
||||
might be slow or overloaded. The source website might be
|
||||
untrustworthy, in which case using Friendica to scrub the HTML is a
|
||||
good idea. You might be on a LAN that blacklists certain websites.
|
||||
It also works neatly with the mailstream plugin, allowing you to read
|
||||
a news stream comfortably without needing continuous Internet
|
||||
connectivity.
|
||||
</p>
|
||||
<p>
|
||||
However, setting up retriever can be quite tricky since it depends on
|
||||
the internal design of the website. That was designed to make life
|
||||
easy for the website's developers, not for you. You'll need to have
|
||||
some familiarity with HTML, and be willing to adapt when the website
|
||||
suddenly changes everything without notice.
|
||||
</p>
|
||||
<h3>Configuring Retriever for a feed</h3>
|
||||
<p>
|
||||
To set up retriever for an RSS feed, go to the "Contacts" page and
|
||||
find your feed. Then click on the drop-down menu on the contact.
|
||||
Select "Retriever" to get to the retriever configuration.
|
||||
</p>
|
||||
<p>
|
||||
The "Include" configuration section specifies parts of the page to
|
||||
include in the article. Each row has three components:
|
||||
</p>
|
||||
<ul>
|
||||
<li>An HTML tag (e.g. "div", "span", "p")</li>
|
||||
<li>An attribute (usually "class" or "id")</li>
|
||||
<li>A value for the attribute</li>
|
||||
</ul>
|
||||
<p>
|
||||
A simple case is when the article is wrapped in a "div" element:
|
||||
</p>
|
||||
<pre>
|
||||
...
|
||||
<div class="ArticleWrapper">
|
||||
<h2>Man Bites Dog</h2>
|
||||
<img src="mbd.jpg">
|
||||
<p>
|
||||
Residents of the sleepy community of Nowheresville were
|
||||
shocked yesterday by the sight of creepy local weirdo Jim
|
||||
McOddman assaulting innocent local dog Snufflekins with his
|
||||
false teeth.
|
||||
</p>
|
||||
...
|
||||
</div>
|
||||
...
|
||||
</pre>
|
||||
<p>
|
||||
You then specify the tag "div", attribute "class", and value
|
||||
"ArticleWrapper". Everything else in the page, such as navigation
|
||||
panels and menus and footers and so on, will be discarded. If there
|
||||
is more than one section of the page you want to include, specify each
|
||||
one on a separate row. If the matching section contains some sections
|
||||
you want to remove, specify those in the "Exclude" section in the same
|
||||
way.
|
||||
</p>
|
||||
<p>
|
||||
Once you've got a configuration that you think will work, you can try
|
||||
it out on some existing articles. Type a number into the
|
||||
"Retrospectively Apply" box and click "Submit". After a while
|
||||
(exactly how long depends on your system's cron configuration) the new
|
||||
articles should be available.
|
||||
</p>
|
||||
<h3>Techniques</h3>
|
||||
<p>
|
||||
You can leave the attribute and value blank to include all the
|
||||
corresponding elements with the specified tag name. You can also use
|
||||
a tag name of just an asterisk ("*"), which will match any element type with the
|
||||
specified attribute regardless of the tag.
|
||||
</p>
|
||||
<p>
|
||||
Note that the "class" attribute is a special case. Many web page
|
||||
templates will put multiple different classes in the same element,
|
||||
separated by spaces. If you specify an attribute of "class" it will
|
||||
match an element if any of its classes matches the specified value.
|
||||
For example:
|
||||
</p>
|
||||
<pre>
|
||||
<div class="article breaking-news">
|
||||
</pre>
|
||||
<p>
|
||||
In this case you can specify a value of "article", or "breaking-news".
|
||||
You can also specify "article breaking-news", but that won't match if
|
||||
the website suddenly changes to "breaking-news article", so that's not
|
||||
recommended.
|
||||
</p>
|
||||
<p>
|
||||
One useful trick you can try is using the website's "print" pages.
|
||||
Many news sites have print versions of all their articles. These are
|
||||
usually drastically simplified compared to the live website page.
|
||||
Sometimes this is a good way to get the whole article when it's
|
||||
normally split across multiple pages.
|
||||
</p>
|
||||
<p>
|
||||
Hopefully the URL for the print page is a predictable variant of the
|
||||
normal article URL. For example, an article URL like:
|
||||
</p>
|
||||
<pre>
|
||||
http://www.newssite.com/article-8636.html
|
||||
</pre>
|
||||
<p>
|
||||
...might have a print version at:
|
||||
</p>
|
||||
<pre>
|
||||
http://www.newssite.com/print/article-8636.html
|
||||
</pre>
|
||||
<p>
|
||||
To change the URL used to retrieve the page, use the "URL Pattern" and
|
||||
"URL Replace" fields. The pattern is a regular expression matching
|
||||
part of the URL to replace. In this case, you might use a pattern of
|
||||
"/article" and a replace string of "/print/article". A common pattern
|
||||
is simply a dollar sign ("$"), used to add the replace string to the end of the URL.
|
||||
</p>
|
||||
<h3>Background Processing</h3>
|
||||
<p>
|
||||
Note that retrieving and processing the articles can take some time,
|
||||
so it's done in the background. Incoming articles will be marked as
|
||||
invisible while they're in the process of being downloaded. If a URL
|
||||
fails, the plugin will keep trying at progressively longer intervals
|
||||
for up to a month, in case the website is temporarily overloaded or
|
||||
the network is down.
|
||||
</p>
|
||||
{{if $allow_images}}
|
||||
<h3>Retrieving Images</h3>
|
||||
<p>
|
||||
Retriever can also optionally download images and store them in the
|
||||
local Friendica instance. Just check the "Download Images" box. You
|
||||
can also download images in every item from your network, whether it's
|
||||
an RSS feed or not. Go to the "Settings" page and
|
||||
click <a href="$config">"Plugin settings"</a>. Then check the "All
|
||||
Photos" box in the "Retriever Settings" section and click "Submit".
|
||||
</p>
|
||||
{{/if}}
|
||||
<h2>Configure Feeds:</h2>
|
||||
<div>
|
||||
{{foreach $feeds as $feed}}
|
||||
<div class="contact-entry-wrapper" id="contact-entry-wrapper-{{$feed.id}}">
|
||||
<a href="{{$feed.url}} title="{{$feed.img_hover}}">
|
||||
<div class="contact-entry-photo-wrapper">
|
||||
<div class="contact-entry-photo mframe" id="contact-entry-photo-{{$feed.id}}">
|
||||
<img src="{{$feed.thumb}}" {{$feed.sparkle}} alt="{{$feed.name}}"/>
|
||||
</div>
|
||||
</div>
|
||||
<div class="contact-entry-desc">
|
||||
<div class="contact-entry-name" id="contact-entry-name-{{$feed.id}}">
|
||||
{{$feed.name}}
|
||||
</div>
|
||||
</div>
|
||||
</a>
|
||||
</div>
|
||||
{{/foreach}}
|
||||
</div>
|
154
retriever/templates/rule-config.tpl
Normal file
154
retriever/templates/rule-config.tpl
Normal file
|
@ -0,0 +1,154 @@
|
|||
<div class="settings-block">
|
||||
<script language="javascript">
|
||||
function retriever_add_row(id)
|
||||
{
|
||||
var tbody = document.getElementById(id);
|
||||
var last = tbody.rows[tbody.childElementCount - 1];
|
||||
var count = +last.id.replace(id + '-', '');
|
||||
count++;
|
||||
var row = document.createElement('tr');
|
||||
row.id = id + '-' + count;
|
||||
var cell1 = document.createElement('td');
|
||||
var inptag = document.createElement('input');
|
||||
inptag.name = row.id + '-element';
|
||||
cell1.appendChild(inptag);
|
||||
row.appendChild(cell1);
|
||||
var cell2 = document.createElement('td');
|
||||
var inpatt = document.createElement('input');
|
||||
inpatt.name = row.id + '-attribute';
|
||||
cell2.appendChild(inpatt);
|
||||
row.appendChild(cell2);
|
||||
var cell3 = document.createElement('td');
|
||||
var inpval = document.createElement('input');
|
||||
inpval.name = row.id + '-value';
|
||||
cell3.appendChild(inpval);
|
||||
row.appendChild(cell3);
|
||||
var cell4 = document.createElement('td');
|
||||
var butrem = document.createElement('input');
|
||||
butrem.id = row.id + '-rem';
|
||||
butrem.type = 'button';
|
||||
butrem.onclick = function(){retriever_remove_row(id, count)};
|
||||
butrem.value = '{{$remove_t}}';
|
||||
cell4.appendChild(butrem);
|
||||
row.appendChild(cell4);
|
||||
tbody.appendChild(row);
|
||||
}
|
||||
|
||||
function retriever_remove_row(id, number)
|
||||
{
|
||||
var tbody = document.getElementById(id);
|
||||
var row = document.getElementById(id + '-' + number);
|
||||
tbody.removeChild(row);
|
||||
}
|
||||
|
||||
function retriever_toggle_url_block()
|
||||
{
|
||||
var pattern = document.querySelector("#id_retriever_pattern").parentNode;
|
||||
if (document.querySelector("#id_retriever_modurl").checked) {
|
||||
pattern.style.display = "block";
|
||||
}
|
||||
else {
|
||||
pattern.style.display = "none";
|
||||
}
|
||||
|
||||
var replace = document.querySelector("#id_retriever_replace").parentNode;
|
||||
if (document.querySelector("#id_retriever_modurl").checked) {
|
||||
replace.style.display = "block";
|
||||
}
|
||||
else {
|
||||
replace.style.display = "none";
|
||||
}
|
||||
}
|
||||
|
||||
function retriever_toggle_cookiedata_block()
|
||||
{
|
||||
var div = document.querySelector("#id_retriever_cookiedata").parentNode;
|
||||
if (document.querySelector("#id_retriever_storecookies").checked) {
|
||||
div.style.display = "block";
|
||||
}
|
||||
else {
|
||||
div.style.display = "none";
|
||||
}
|
||||
}
|
||||
|
||||
document.addEventListener('DOMContentLoaded', function() {
|
||||
retriever_toggle_url_block();
|
||||
document.querySelector("#id_retriever_modurl").addEventListener('change', retriever_toggle_url_block, false);
|
||||
retriever_toggle_cookiedata_block();
|
||||
document.querySelector("#id_retriever_storecookies").addEventListener('change', retriever_toggle_cookiedata_block, false);
|
||||
}, false);
|
||||
</script>
|
||||
<h2>{{$title}}</h2>
|
||||
<p><a href="{{$help}}">{{$help_t}}</a></p>
|
||||
<form method="post">
|
||||
<input type="hidden" name="id" value="{{$id}}">
|
||||
{{include file="field_checkbox.tpl" field=$enable}}
|
||||
<h3>{{$include_t}}:</h3>
|
||||
<div>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>{{$tag_t}}</th><th>{{$attribute_t}}</th><th>{{$value_t}}</th></tr>
|
||||
</thead>
|
||||
<tbody id="retriever-include">
|
||||
{{if $include}}
|
||||
{{foreach $include as $k=>$m}}
|
||||
<tr id="retriever-include-{{$k}}">
|
||||
<td><input name="retriever-include-{{$k}}-element" value="{{$m.element}}"></td>
|
||||
<td><input name="retriever-include-{{$k}}-attribute" value="{{$m.attribute}}"></td>
|
||||
<td><input name="retriever-include-{{$k}}-value" value="{{$m.value}}"></td>
|
||||
<td><input id="retrieve-include-{{$k}}-rem" type="button" onclick="retriever_remove_row('retriever-include', {{$k}})" value="{{$remove_t}}"></td>
|
||||
</tr>
|
||||
{{/foreach}}
|
||||
{{else}}
|
||||
<tr id="retriever-include-0">
|
||||
<td><input name="retriever-include-0-element"></td>
|
||||
<td><input name="retriever-include-0-attribute"></td>
|
||||
<td><input name="retriever-include-0-value"></td>
|
||||
<td><input id="retrieve-include-0-rem" type="button" onclick="retriever_remove_row('retriever-include', 0)" value="{{$remove_t}}"></td>
|
||||
</tr>
|
||||
{{/if}}
|
||||
</tbody>
|
||||
</table>
|
||||
<input type="button" onclick="retriever_add_row('retriever-include')" value="{{$add_t}}">
|
||||
</div>
|
||||
<h3>{{$exclude_t}}:</h3>
|
||||
<div>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>{{$tag_t}}</th><th>{{$attribute_t}}</th><th>{{$value_t}}</th></tr>
|
||||
</thead>
|
||||
<tbody id="retriever-exclude">
|
||||
{{if $exclude}}
|
||||
{{foreach $exclude as $k=>$r}}
|
||||
<tr id="retriever-exclude-{{$k}}">
|
||||
<td><input name="retriever-exclude-{{$k}}-element" value="{{$r.element}}"></td>
|
||||
<td><input name="retriever-exclude-{{$k}}-attribute" value="{{$r.attribute}}"></td>
|
||||
<td><input name="retriever-exclude-{{$k}}-value" value="{{$r.value}}"></td>
|
||||
<td><input id="retrieve-exclude-{{$k}}-rem" type="button" onclick="retriever_remove_row('retriever-exclude', {{$k}})" value="{{$remove_t}}"></td>
|
||||
</tr>
|
||||
{{/foreach}}
|
||||
{{else}}
|
||||
<tr id="retriever-exclude-0">
|
||||
<td><input name="retriever-exclude-0-element"></td>
|
||||
<td><input name="retriever-exclude-0-attribute"></td>
|
||||
<td><input name="retriever-exclude-0-value"></td>
|
||||
<td><input id="retrieve-exclude-0-rem" type="button" onclick="retriever_remove_row('retriever-exclude', 0)" value="{{$remove_t}}"></td>
|
||||
</tr>
|
||||
{{/if}}
|
||||
</tbody>
|
||||
</table>
|
||||
<input type="button" onclick="retriever_add_row('retriever-exclude')" value="{{$add_t}}">
|
||||
</div>
|
||||
{{include file="field_checkbox.tpl" field=$modurl}}
|
||||
{{include file="field_input.tpl" field=$pattern}}
|
||||
{{include file="field_input.tpl" field=$replace}}
|
||||
{{if $allow_images}}
|
||||
{{include file="field_checkbox.tpl" field=$images}}
|
||||
{{/if}}
|
||||
{{include file="field_textarea.tpl" field=$customxslt}}
|
||||
{{include file="field_checkbox.tpl" field=$storecookies}}
|
||||
{{include file="field_textarea.tpl" field=$cookiedata}}
|
||||
{{include file="field_input.tpl" field=$retrospective}}
|
||||
<input type="submit" size="70" value="{{$submit_t}}">
|
||||
</form>
|
||||
</div>
|
16
retriever/templates/settings.tpl
Normal file
16
retriever/templates/settings.tpl
Normal file
|
@ -0,0 +1,16 @@
|
|||
<span id="settings_retriever_inflated" class="settings-block fakelink" style="display: block;" onclick="openClose('settings_retriever_expanded'); openClose('settings_retriever_inflated');">
|
||||
<h3>{{$title}}</h3>
|
||||
</span>
|
||||
<div id="settings_retriever_expanded" class="settings-block" style="display: none;">
|
||||
<span class="fakelink" onclick="openClose('settings_retriever_expanded'); openClose('settings_retriever_inflated');">
|
||||
<h3>{{$title}}</h3>
|
||||
</span>
|
||||
<p>
|
||||
<a href="{{$help}}">Get Help</a>
|
||||
</p>
|
||||
{{if $allow_images}}
|
||||
{{include file="field_checkbox.tpl" field=$allphotos}}
|
||||
{{/if}}
|
||||
{{include file="field_checkbox.tpl" field=$oembed}}
|
||||
<input type="submit" value="{{$submit}}">
|
||||
</div>
|
Loading…
Add table
Add a link
Reference in a new issue