"""Base handler class and common utilities for handling webmentions.
Used in publish.py and blog_webmention.py.
Webmention spec: http://webmention.org/
"""
import logging
from flask import jsonify, request
from flask.views import View
from google.cloud import error_reporting
from webutil.util import json_dumps, json_loads
from webutil import flask_util
import werkzeug.exceptions
from flask_app import app
import util
logger = logging.getLogger(__name__)
[docs]
@app.route('/publish/<any(bluesky,flickr,github,mastodon):silo>',
           methods=['GET', 'HEAD'])
@flask_util.headers({'Cache-Control': 'public, max-age=86400'})
def webmention_get_or_head(silo):
    """Serve webmention endpoint discovery for GET and HEAD requests.

    The endpoint is advertised twice: as a ``<link rel="webmention">`` element
    in a small HTML page, and as an HTTP ``Link`` header, so that clients that
    only issue a HEAD can still discover it.

    Args:
      silo: str, silo name captured from the URL. Not used in the response.

    Returns:
      (str HTML body, dict of extra response headers) tuple
    """
    endpoint = util.host_url('/publish/webmention')

    body = f"""\
<!DOCTYPE html>
<html><head>
<link rel="webmention" href="{endpoint}">
</head>
<body>Nothing here! <a href="/about">Try the docs instead.</a></body>
</html>"""
    headers = {
        'Link': f'<{endpoint}>; rel="webmention"',
    }
    return body, headers
[docs]
class Webmention(View):
    """Base view for handling a webmention.

    Provides the shared fetch-and-parse, error handling, and error reporting
    logic used by the concrete webmention views.

    Attributes:
      * source (models.Source): for this webmention
      * entity (models.Publish or models.Webmention) entity for this webmention
    """
    source = None
    entity = None

    def fetch_mf2(self, url, id=None, require_mf2=True, raise_errors=False):
        """Fetch a URL and parse microformats2 out of the response.

        Side effects: stores the response text in ``entity.html`` (when an
        entity is set) after a successful fetch, and calls :meth:`error` on
        failures.

        Args:
          url: str
          id: str, optional id of a specific element to extract and parse.
            Defaults to the whole page.
          require_mf2: boolean, whether to report an error if no mf2 is found
          raise_errors: boolean, whether to let fetch exceptions propagate up
            instead of converting them with :meth:`error`

        Returns:
          (requests.Response, mf2 data dict) tuple
        """
        try:
            resp = util.requests_get(url)
            resp.raise_for_status()
        except werkzeug.exceptions.HTTPException:
            # raised by us, probably via self.error()
            raise
        except BaseException as err:
            logger.warning(repr(err))
            util.interpret_http_exception(err)  # log HTTP error, if any
            if raise_errors:
                raise
            self.error(f'Could not fetch source URL {url}')

        if self.entity:
            self.entity.html = resp.text

        # parse microformats
        soup = util.parse_html(resp)
        mf2 = util.parse_mf2(soup, url=resp.url, id=id)
        if id and not mf2:
            self.error(f'Got fragment {id} but no element found with that id.')

        # special case tumblr's markup: div#content > div.post > div.copy
        # convert to mf2 and re-parse
        if not mf2.get('items'):
            post = self._tumblr_post_as_mf2(soup)
            if post:
                # TODO: i should be able to pass post or contents[0] to mf2py
                # instead here, but it returns no items. mf2py bug?
                mf2 = util.parse_mf2(str(post), resp.url)

        pretty = json_dumps(mf2, indent=2)
        logger.debug(f'Parsed microformats2: {pretty}')

        items = mf2.get('items', [])
        has_item = bool(items) and bool(items[0])
        if require_mf2 and not has_item:
            message = 'No microformats2 data found in ' + resp.url
            self.error(message, data=mf2, html=f"""
No <a href="http://microformats.org/get-started">microformats</a> or
<a href="http://microformats.org/wiki/microformats2">microformats2</a> found in
<a href="{resp.url}">{util.pretty_link(resp.url)}</a>! See <a href="http://indiewebify.me/">indiewebify.me</a>
for details (skip to level 2, <em>Publishing on the IndieWeb</em>).
""")

        return resp, mf2

    @staticmethod
    def _tumblr_post_as_mf2(soup):
        """Rewrite Tumblr-style markup in ``soup`` with mf2 class names.

        Looks for the first ``id="content"`` element, then the next ``post``
        element after it, and relabels it (and its ``copy`` and photo ``img``
        descendants, when present) as ``h-entry`` / ``e-content`` / ``u-photo``.
        Modifies the parsed document in place.

        Args:
          soup: parsed HTML document, as returned by ``util.parse_html``

        Returns:
          the relabeled post element, or None if the markup didn't match
        """
        contents = soup.find_all(id='content')
        if not contents:
            return None

        post = contents[0].find_next(class_='post')
        if not post:
            return None

        post['class'] = 'h-entry'

        copy = post.find_next(class_='copy')
        if copy:
            copy['class'] = 'e-content'

        photo = post.find_next(class_='photo-wrapper')
        if photo:
            img = photo.find_next('img')
            if img:
                img['class'] = 'u-photo'

        return post

    def error(self, error, html=None, status=400, data=None, log_exception=False,
              report=False, extra_json=None, http_response=True):
        """Handle an error. May be overridden by subclasses.

        Marks a ``new`` entity as ``failed`` and stores it, builds a JSON error
        body, optionally reports the error, and optionally hands off to
        ``flask_util.error`` to produce the HTTP error response.

        Args:
          error (str): human-readable error message
          html (str): HTML human-readable error message. Not used by this base
            implementation.
          status (int): HTTP response status code
          data (dict): mf2 data parsed from source page
          log_exception (bool): whether to include a stack trace in the log msg
          report (bool): whether to report to StackDriver Error Reporting
          extra_json (dict): to be merged into the JSON response body
          http_response (bool): whether to return an error HTTP response
        """
        if self.entity and self.entity.status == 'new':
            self.entity.status = 'failed'
            self.entity.put()

        body = {'error': error}
        if data:
            body['parsed'] = data
        if extra_json:
            # callers may not clobber the keys this method populates itself
            for reserved in ('error', 'parsed'):
                assert reserved not in extra_json
            body.update(extra_json)

        if report and status != 404:
            self.report_error(error, status=status)

        if http_response:
            # presumably raises a werkzeug HTTPException carrying the response
            flask_util.error(str(body), status=status, response=jsonify(body),
                             exc_info=log_exception)

    def report_error(self, resp, status=None):
        """Report an error to StackDriver Error Reporting.

        Errors whose text contains any of a list of known, expected failure
        messages are silently skipped.

        Args:
          resp (str): error text to check against the known failures
          status (int): HTTP response status code to attach to the report
        """
        # don't report specific known failures
        known_failures = (
            'Deadline exceeded while waiting for HTTP response',
            'urlfetch.Fetch() took too long',
            # WordPress Jetpack bugs
            # https://github.com/snarfed/bridgy/issues/161
            '"resp": "invalid_input"',
            # https://github.com/snarfed/bridgy/issues/750
            '"error": "jetpack_verification_failed"',
            # https://console.cloud.google.com/errors/CMjIg52NkMLQYA?project=brid-gy
            'The Jetpack site encountered an error and could not process the API request',
            # https://console.cloud.google.com/errors/CL6xvLS7k6qE3QE?project=brid-gy
            'The Jetpack site is inaccessible or returned an error',
            # Blogger known bug
            # https://github.com/snarfed/bridgy/issues/175
            'bX-2i87au',
            # Tumblr: transient Disqus error looking up thread
            # https://github.com/snarfed/bridgy/issues/177
            "Invalid argument, 'thread': Unable to find thread",
            # expected for partially set up tumblr accounts
            "we haven't found your Disqus account",
            # Twitter 5MB image file size limit
            '"message":"Image file size must be',
            # Twitter media file number limits
            'Tweet with media must have exactly 1 gif or video',
            # Facebook image type/size req'ts
            'Missing or invalid image file',
            "Your photos couldn't be uploaded. Photos should be less than 4 MB",
            # Twitter duplicate publish attempts
            'Status is a duplicate.',
            'You have already favorited this status.',
            'You have already retweeted this',
            # Facebook duplicate publish attempts
            'This status update is identical to the last one you posted.',
            # WordPress duplicate comment
            # "error": "Error: 409 HTTP Error 409: Conflict; {\n \"error\": \"comment_duplicate\",\n \"message\": \"Duplicate comment detected; it looks as though you’ve already said that!\"\n}\n"
            'comment_duplicate',
        )
        if any(snippet in resp for snippet in known_failures):
            return

        if self.entity:
            state = f'{self.entity.type} {self.entity.status}'
        else:
            state = 'failed'
        subject = f'{self.__class__.__name__} {state}'

        user = self.source.bridgy_url() if self.source else None
        context = error_reporting.HTTPContext(
            method=request.method,
            url=request.url,
            response_status_code=status,
            remote_ip=request.remote_addr)
        util.report_error(subject, user=user, http_context=context)