poobrains-pdn/pdn.py

622 lines
19 KiB
Python
Raw Permalink Normal View History

2017-03-12 22:43:46 +00:00
#! /usr/bin/env python
# -*- coding: utf-8 -*-
2017-06-27 15:20:16 +00:00
import math
2017-07-09 01:27:34 +00:00
import string
2017-04-18 03:58:01 +00:00
import os
2017-03-13 00:59:42 +00:00
import random
import datetime
import requests
import bs4
2017-04-18 03:58:01 +00:00
import markdown
2017-07-09 21:32:25 +00:00
import click
import flask
2017-03-12 22:43:46 +00:00
import poobrains
2017-05-25 05:46:22 +00:00
from io import BytesIO
2017-05-25 05:59:03 +00:00
from PIL import Image, ImageDraw, ImageFont, ImageSequence
2017-04-18 03:58:01 +00:00
2017-03-12 22:43:46 +00:00
# Module-level shorthand for the poobrains application object; the route,
# expose, box and after_request decorators in this module are accessed through it.
app = poobrains.app
@app.route('/')
def front():
    """Redirect the site root.

    Tries the 'full' URL of the user loaded as 'phryk' first; if anything
    in that attempt raises, redirects to the URL of `Article` instead.
    """
    try:
        profile_url = poobrains.auth.User.load('phryk').url('full')
        response = poobrains.redirect(profile_url)
    except Exception:
        # Deliberately broad: any failure of the primary target falls back
        # to the article listing.
        response = poobrains.redirect(Article.url())
    return response
2017-05-26 15:59:15 +00:00
class MemePattern(markdown.inlinepatterns.Pattern):
    """Inline markdown pattern that renders a matched meme tag as an <img>.

    Every caption seen is recorded in `MemeWhiteList` (once), and the
    produced image points at `/meme/<name>/<caption>`.
    """

    name = None

    def __init__(self, pattern, name, md=None):
        """Store the meme `name` next to the regular `Pattern` setup."""
        super().__init__(pattern, md=md)
        self.name = name

    def handleMatch(self, match):
        """Return an `img` element for `match`, whitelisting its caption."""
        if not match:
            return super().handleMatch(match)

        # NOTE(review): group(2) is presumably the first capture group of the
        # supplied pattern (markdown wraps the regex) - confirm against the
        # markdown version in use.
        caption = match.group(2)

        known = MemeWhiteList.select().where(MemeWhiteList.caption == caption).count()
        if not known:
            entry = MemeWhiteList()
            entry.caption = caption
            entry.save()

        img = markdown.util.etree.Element('img')
        img.set('src', "/meme/%s/%s" % (self.name, caption))
        img.set('alt', caption)
        return img
2017-04-18 03:58:01 +00:00
class Memextension(markdown.Extension):
    """Markdown extension adding one `MemePattern` per entry of app.config['MEMES']."""

    def extendMarkdown(self, md, md_globals):
        """Register a `<name>...</name>` inline pattern for every configured meme."""
        if 'MEMES' not in app.config:
            return

        for meme_name in app.config['MEMES']:
            tag_regex = '<%s>(.*?)</%s>' % (meme_name, meme_name)
            md.inlinePatterns.add(meme_name, MemePattern(tag_regex, meme_name), '<reference')
2017-04-18 03:58:01 +00:00
# Hook the meme extension into the markdown instance exposed by poobrains.md.
poobrains.md.md.registerExtensions([Memextension()], [])
2017-10-21 00:25:40 +00:00
2017-05-26 15:59:15 +00:00
def _meme_text_size(font, text):
    """Return the (width, height) of `text` rendered with `font`.

    Uses `font.getsize` where it exists so results on older Pillow are
    unchanged, and falls back to `font.getbbox` where `getsize` is gone.
    """
    if hasattr(font, 'getsize'):
        return font.getsize(text)
    # NOTE(review): right/bottom of the bounding box is expected to equal what
    # getsize reported (offset included) - verify against the Pillow docs.
    _, _, right, bottom = font.getbbox(text)
    return (right, bottom)


def _meme_response(payload, mimetype):
    """Wrap encoded image bytes in a publicly cacheable flask response."""
    response = flask.Response(payload, mimetype=mimetype)
    response.cache_control.public = True
    response.cache_control.max_age = 604800  # seconds
    return response


@app.expose('/meme/<string:name>/<string:caption>')
class Mememage(poobrains.auth.Protected):
    """Serves meme templates with a caption drawn onto them."""

    def view(self, mode='full', name=None, caption=None):
        """Render the meme `name` with `caption` and return it as an image.

        A caption of the form ``upper:lower`` (exactly one colon) is split
        into a top and a bottom line; any other caption is drawn at the
        bottom only. Templates whose filename ends in ``gif`` are processed
        frame by frame and returned as GIF, everything else as PNG.

        Raises:
            poobrains.auth.AccessDenied: if `name` is not a configured meme
                or `caption` is not present in `MemeWhiteList`.
        """
        allowed = (
            name in app.config['MEMES']
            and MemeWhiteList.select().where(MemeWhiteList.caption == caption).count()
        )
        if not allowed:
            raise poobrains.auth.AccessDenied()

        # TODO: is input sanitation still needed?
        parts = caption.split(':')
        if len(parts) == 2:
            upper, lower = parts
        else:
            upper, lower = None, caption

        filename = os.path.join(app.root_path, app.config['MEMES'][name])
        extension = filename.split('.')[-1]
        font = ImageFont.truetype(os.path.join(app.root_path, 'LeagueGothic-Regular.otf'), 80)

        # Context manager so the template's file handle is released on every
        # path, including exceptions during rendering.
        with Image.open(filename) as template:

            # Scale to a fixed width of 750px, keeping the aspect ratio.
            resized = (750, int(template.height * (750.0 / template.width)))

            # The caption overlay does not depend on the frame, so it is
            # drawn once and composited onto every frame.
            text_layer = Image.new('RGBA', resized, (0, 0, 0, 0))
            text_draw = ImageDraw.Draw(text_layer)

            if upper:
                upper_size = _meme_text_size(font, upper)
                upper_x = int(round(resized[0] / 2.0 - upper_size[0] / 2.0))
                upper_y = 0
                outlined_text(text_draw, upper, upper_x, upper_y, font=font)

            if lower:
                lower_size = _meme_text_size(font, lower)
                lower_x = int(round(resized[0] / 2.0 - lower_size[0] / 2.0))
                lower_y = resized[1] - lower_size[1] - 10  # last int is margin from bottom
                outlined_text(text_draw, lower, lower_x, lower_y, font=font)

            out = BytesIO()

            if extension == 'gif':
                frames = [
                    Image.alpha_composite(
                        frame.convert('RGBA').resize(resized, Image.BICUBIC),
                        text_layer
                    )
                    for frame in ImageSequence.Iterator(template)
                ]

                meme = frames.pop(0).convert('P')
                # Carry animation timing/looping over from the template.
                for key in ('duration', 'loop'):
                    if key in template.info:
                        meme.info[key] = template.info[key]

                meme.save(out, save_all=True, append_images=frames, format='GIF')
                return _meme_response(out.getvalue(), 'image/gif')

            meme = template.convert('RGBA').resize(resized, Image.BICUBIC)
            meme = Image.alpha_composite(meme, text_layer)
            meme.save(out, format='PNG')
            return _meme_response(out.getvalue(), 'image/png')
2017-03-13 00:59:42 +00:00
2017-05-25 05:46:22 +00:00
def outlined_text(drawing, text, x=0, y=0, font=None, *, outline=1,
                  fill=(255, 255, 255, 255), outline_fill=(0, 0, 0, 255)):
    """Draw `text` at (x, y) with a contrasting outline.

    The outline is produced by drawing the text four times, shifted
    diagonally by `outline` pixels, before drawing the text itself on top.

    Args:
        drawing: object with a `text(xy, text, font=..., fill=...)` method,
            e.g. a PIL `ImageDraw.Draw` instance.
        text: the string to draw.
        x, y: position of the text.
        font: font passed through to `drawing.text`.
        outline: diagonal offset of the outline copies in pixels.
        fill: colour of the text itself.
        outline_fill: colour of the outline copies.
    """
    # Outline copies first, so the main text ends up on top of them.
    for dx, dy in ((-outline, -outline), (-outline, outline),
                   (outline, outline), (outline, -outline)):
        drawing.text((x + dx, y + dy), text, font=font, fill=outline_fill)

    drawing.text((x, y), text, font=font, fill=fill)
2017-03-13 00:59:42 +00:00
# content types
2017-05-26 15:59:15 +00:00
class MemeWhiteList(poobrains.storage.Model):
    """Captions that may be rendered as meme images.

    Rows are written by `MemePattern.handleMatch` and checked by
    `Mememage.view` before an image is generated.
    """
    # TODO: add functionality to remove references to deleted/removed storable instances
    caption = poobrains.storage.fields.CharField()
class ScoredLink(poobrains.auth.Administerable):
    """A link scored by the number of third-party resources its page embeds.

    The score (`external_site_count`) is re-scraped on every `save`.
    `set_size`, `external_site_counts`, `median` and `mean` are statistics
    over all stored links, not just this instance.
    """

    class Meta:
        form_blacklist = ['id', 'external_site_count', 'updated']

    link = poobrains.storage.fields.CharField(unique=True) # TODO: Add an URLField to poobrains.
    external_site_count = poobrains.storage.fields.IntegerField(null=True)
    updated = poobrains.storage.fields.DateTimeField(null=False, default=datetime.datetime.now)

    def scrape_external_site_count(self):
        """Fetch `self.link` and count embedded resources served from other hosts.

        Looks at script/img `src`, link `href` and object `data` attributes.
        A resource counts as external when its host is neither the link's
        host nor a subdomain of it. Relative URLs are never external.

        Returns:
            int: number of external resources; 0 if `self.link` is empty.

        Raises:
            Whatever `requests.get` raises for an unreachable or invalid
            URL, or ValueError if `self.link` cannot be parsed.
        """
        # Function-scope import so this block does not depend on the
        # file-level import list.
        from urllib.parse import urlparse

        if not self.link:
            return 0

        link_domain = urlparse(self.link).hostname

        html = requests.get(self.link, timeout=30).text
        dom = bs4.BeautifulSoup(html, 'lxml')

        scored_elements = {
            'script': 'src',
            'link': 'href',
            'img': 'src',
            'object': 'data'
        }

        external_site_count = 0
        for tag, attribute in scored_elements.items():
            for element in dom.find_all(tag):

                attribute_value = element.get(attribute)
                if not isinstance(attribute_value, str):
                    continue

                try:
                    # urlparse also yields a host for protocol-relative
                    # URLs ("//cdn.example/x.js"), which would otherwise be
                    # mistaken for relative links.
                    attribute_domain = urlparse(attribute_value).hostname
                except ValueError:
                    continue  # malformed URL in the page; ignore this element

                if not attribute_domain:
                    continue  # no host part means this is a relative link

                if attribute_domain != link_domain and \
                not attribute_domain.endswith('.%s' % link_domain): # whether attribute_domain is a subdomain of link_domain
                    external_site_count += 1

        return external_site_count

    def save(self, *args, **kwargs):
        """Refresh the score (best effort), then save as usual.

        A failed scrape is only logged; the previous `external_site_count`
        and `updated` values are kept and the instance is saved regardless.
        """
        try:
            self.external_site_count = self.scrape_external_site_count()
            self.updated = datetime.datetime.now()

        except Exception as e: # Match all errors so failures here don't interfere with normal operations
            poobrains.app.logger.error('Could not scrape external site count for URL: %s' % self.link)
            poobrains.app.logger.debug('Problem when scraping external site count: %s: %s' % (str(type(e)), str(e)))

        return super(ScoredLink, self).save(*args, **kwargs)

    @property
    def set_size(self):
        """Number of stored links."""
        return self.__class__.select().count()

    @property
    def external_site_counts(self):
        """Ascending list of all non-null scores. Runs a query on every access."""
        model = self.__class__
        # `!= None` is intentional: it is evaluated by the ORM field to build
        # the query, so it must not be rewritten as `is not None`.
        query = (
            model.select(model.external_site_count)
            .where(model.external_site_count != None)
            .order_by(model.external_site_count)
            .dicts()
        )
        return [row['external_site_count'] for row in query]

    @property
    def median(self):
        """Median of all scores as float, or None when no link has a score."""
        counts = self.external_site_counts  # fetch once instead of once per use
        if not counts:
            return None

        middle = len(counts) // 2
        if len(counts) % 2 == 0:
            return (counts[middle - 1] + counts[middle]) / 2.0
        return float(counts[middle])

    @property
    def mean(self):
        """Arithmetic mean of all scores as float, or None when no link has a score."""
        counts = self.external_site_counts
        if not counts:
            return None
        return sum(counts) / float(len(counts))

    @property
    def name(self):
        """The link itself serves as this object's name."""
        return self.link
@app.expose('/source/organization/', mode='full')
class SourceOrganization(poobrains.commenting.Commentable):
    """An organization that sources can be attributed to (via SourceOrganizationAuthor)."""

    parent = poobrains.storage.fields.ForeignKeyField('self', null=True)  # optional parent organization
    title = poobrains.storage.fields.CharField()
    link = poobrains.storage.fields.ForeignKeyField(ScoredLink, null=True)  # optional scored link
    description = poobrains.md.MarkdownField(null=True)
2017-03-12 22:43:46 +00:00
@app.expose('/source/author/', mode='full')
class SourceAuthor(poobrains.commenting.Commentable):
    """An author that sources can be attributed to (via SourceOrganizationAuthor)."""

    title = poobrains.storage.fields.CharField()
    link = poobrains.storage.fields.ForeignKeyField(ScoredLink, null=True)  # optional scored link
    description = poobrains.md.MarkdownField(null=True)
2018-04-06 05:29:31 +00:00
@app.expose('/source/organizationauthor/', mode='full')
class SourceOrganizationAuthor(poobrains.commenting.Commentable):
    """Pairs a SourceOrganization with a SourceAuthor; `Source.author` points at this pairing."""

    organization = poobrains.storage.fields.ForeignKeyField(SourceOrganization)
    author = poobrains.storage.fields.ForeignKeyField(SourceAuthor)
2017-03-12 22:43:46 +00:00
@app.expose('/source/', mode='full')
class Source(poobrains.commenting.Commentable):
    """A single source, attributed to an organization/author pairing."""

    title = poobrains.storage.fields.CharField()
    type = poobrains.storage.fields.CharField() # TODO: We need some logic to make this useful. Also, build enum type compatible to sqlite+postgres?
    author = poobrains.storage.fields.ForeignKeyField(SourceOrganizationAuthor)
    link = poobrains.storage.fields.ForeignKeyField(ScoredLink, null=True)  # optional scored link
    description = poobrains.md.MarkdownField()
2017-03-12 22:43:46 +00:00
@app.expose('/article/', mode='full')
class Article(poobrains.commenting.Commentable):
    """An article consisting of a title and markdown text."""

    title = poobrains.storage.fields.CharField()
    text = poobrains.md.MarkdownField()
@app.expose('/projects/', mode='full')
class Project(poobrains.commenting.Commentable):
    """A project entry: title, markdown text and a plain (unscored) link."""

    title = poobrains.storage.fields.CharField()
    text = poobrains.md.MarkdownField()
    link = poobrains.storage.fields.CharField()  # stored as plain text, not as a ScoredLink
2017-03-12 22:43:46 +00:00
@app.expose('/curated/', mode='full')
class CuratedContent(poobrains.commenting.Commentable):
    """A curated entry: title, markdown description and an optional scored link."""

    title = poobrains.storage.fields.CharField()
    description = poobrains.md.MarkdownField()
    link = poobrains.storage.fields.ForeignKeyField(ScoredLink, null=True)
2017-03-16 16:56:17 +00:00
@app.site.box('menu_main')
def menu_main():
    """Build the main menu.

    Adds a 'teaser' listing link for each content type, skipping any type
    whose `url('teaser')` raises `AccessDenied`, followed by the entries
    reported by `poobrains.auth.Page.main_menu_entries()`.

    Returns:
        poobrains.rendering.Menu: the populated menu.
    """
    menu = poobrains.rendering.Menu('main')

    sections = (
        (Article, 'Articles'),
        (Project, 'Projects'),
        (CuratedContent, 'Curated content'),
        (Source, 'Sources'),
    )

    for content_type, label in sections:
        try:
            menu.append(content_type.url('teaser'), label)
        except poobrains.auth.AccessDenied:
            # url() refused access: leave this entry out of the menu.
            pass

    for url, caption in poobrains.auth.Page.main_menu_entries():
        menu.append(url, caption)

    return menu
2017-03-12 22:43:46 +00:00
2017-03-13 00:59:42 +00:00
# Word pools for the X-Doge response header: one word is picked from
# 'prefix', one from 'thing' (plus 'thing_tls' on secure requests) and one
# from 'suffix'.
DOGE = {
    'prefix': ['wow', 'such', 'many', 'more', 'so', 'lol', 'very', 'omg'],
    'thing': [
        'wow', 'doge', 'shibe', '1337 h4xx0rz', 'internet', 'pretty',
        'computer', 'free software', 'website', 'content', 'python',
        'flask', 'poobrains', 'NOT PHP',
    ],
    'thing_tls': ['transport layer security', 'X.509', 'certificate'],
    'suffix': ['wow', 'pls', 'mystery', 'anarchy', 'bees'],
}
@app.after_request
def mkdoge(response):
    """Set an X-Doge header made of one random prefix, thing and suffix.

    On secure requests the TLS-themed things are added to the pool of
    things to pick from.
    """
    things = DOGE['thing']
    if flask.request.is_secure:
        things = things + DOGE['thing_tls']  # new list; DOGE itself stays untouched

    pools = (DOGE['prefix'], things, DOGE['suffix'])
    response.headers['X-Doge'] = ' '.join(random.choice(pool) for pool in pools)

    return response
2017-07-09 01:27:34 +00:00
##                                                 ##
## weapon-find ("Waffenfunde") info-scraping things ##
##                                                 ##

# Search terms (German) that scrape_blaulicht queries the presseportal.de
# "Blaulicht" search for, one crawl per term.
MONITOR_PATTERNS = ['waffenfund', 'waffe gefunden', 'waffen gefunden']
2017-07-09 01:27:34 +00:00
2018-01-04 15:22:15 +00:00
@poobrains.app.cron
def scrape_linkscores():
    """Re-save every ScoredLink whose score is older than seven days.

    Saving a ScoredLink re-scrapes its score, so this is what refreshes
    stale scores. Progress and the number of refreshed links are reported
    on the console.
    """
    cutoff = datetime.datetime.now() - datetime.timedelta(days=7)
    refreshed = 0  # number of links actually re-saved

    def show_link(item=None):
        # Shown next to the progress bar; there is no item before the first step.
        return item.link if item else ''

    with click.progressbar(ScoredLink.select(), label="Update link scores where necessary", item_show_func=show_link) as links:
        for link in links:
            if link.updated < cutoff:  # update at most once per period
                link.save()  # ScoredLink scores are updated on every .save
                refreshed += 1

    click.secho(f"Updated {refreshed} link scores.", fg='green')
2018-01-04 15:22:15 +00:00
def _scored_link_for(url):
    """Return the ScoredLink stored for `url`, creating and saving one if missing."""
    try:
        return ScoredLink.get(ScoredLink.link == url)
    except ScoredLink.DoesNotExist:
        link = ScoredLink()
        link.link = url
        link.save()
        return link


#@poobrains.app.cron # bitrotted inactive research, disable
def scrape_blaulicht():
    """Crawl the presseportal.de 'Blaulicht' search and store hits as Sources.

    For every term in MONITOR_PATTERNS the paginated search results are
    walked to collect article URLs; each article not yet known is fetched
    and stored as a `Source`, creating the related `SourceOrganization`,
    `SourceAuthor`, `SourceOrganizationAuthor` and `ScoredLink` rows on the
    way. Articles that cannot be fetched or parsed are reported and skipped.

    All created objects are owned by the user with id 2.
    """
    owner = poobrains.auth.User.get(poobrains.auth.User.id == 2)

    for pattern in MONITOR_PATTERNS:

        article_urls = []

        html = requests.get('http://www.presseportal.de/blaulicht/suche.htx?q=%s' % pattern, timeout=30).text
        dom = bs4.BeautifulSoup(html, 'lxml')

        click.echo("Beginning crawl of pagination for search pattern '%s'." % pattern)

        while True:

            for article in dom.find_all('article'):
                try:
                    article_urls.append(article.find('h2', attrs={'class': 'news-headline'}).a['href'])
                except (AttributeError, TypeError, KeyError):
                    # headline element, its anchor, or the href is missing
                    click.echo("Article appears to be without headline link, skipping")

            next_page = dom.find(attrs={'class': 'pagination-next'})
            if next_page is None:
                break  # last page reached

            # The site exposes the next-page link as an element carrying a
            # data-url attribute rather than as a regular anchor.
            next_page_url = 'http://www.presseportal.de/blaulicht/%s' % next_page['data-url']
            click.echo("Next page: %s" % next_page_url)
            dom = bs4.BeautifulSoup(requests.get(next_page_url, timeout=30).text, 'lxml')

        click.echo("URL collection done, found %d articles." % len(article_urls))

        click.echo("Beginning crawl of individual articles.")

        for article_url in article_urls:

            url = 'http://www.presseportal.de%s' % article_url

            # Skip articles that already have a Source attached to their link.
            try:
                testlink = ScoredLink.get(ScoredLink.link == url)
                if Source.select().where(Source.link == testlink).count():
                    click.echo("Already know source with link %s, skipping." % url)
                    continue
            except ScoredLink.DoesNotExist:
                pass

            try:
                dom = bs4.BeautifulSoup(requests.get(url, timeout=30).text, 'lxml')
            except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
                message = '%s for %s: %s' % (type(e).__name__, url, str(e))
                click.echo(message)
                poobrains.app.logger.error(message)
                continue

            company_dom = dom.find('h2', attrs={'class': 'story-company'})
            org_dom = company_dom.a if company_dom is not None else None
            if org_dom is None:
                message = "Couldn't extract source organization for %s" % url
                click.echo(message)
                poobrains.app.logger.error(message)
                continue

            try:
                org = SourceOrganization.get(SourceOrganization.title == org_dom.text)
            except SourceOrganization.DoesNotExist:
                org = SourceOrganization()
                org.name = poobrains.helpers.clean_string(org_dom.text)
                org.title = org_dom.text
                org.link = _scored_link_for('http://www.presseportal.de%s' % org_dom['href'])
                org.owner = owner
                org.save()

            # The organization doubles as its own author.
            try:
                author = SourceAuthor.get(SourceAuthor.name == org.name)
            except SourceAuthor.DoesNotExist:
                author = SourceAuthor()
                author.name = org.name
                author.title = org.title
                author.link = org.link
                author.owner = owner
                author.save()

            try:
                orgauthor = SourceOrganizationAuthor.get(SourceOrganizationAuthor.organization == org, SourceOrganizationAuthor.author == author)
            except SourceOrganizationAuthor.DoesNotExist:
                orgauthor = SourceOrganizationAuthor()
                orgauthor.name = '%s-%s' % (org.name, author.name)
                orgauthor.organization = org
                orgauthor.author = author
                orgauthor.owner = owner
                orgauthor.save()

            source_link = _scored_link_for(url)

            headline_dom = dom.find('h1', attrs={'class': 'story-headline'})
            if headline_dom is None:
                message = "Couldn't extract headline for %s" % url
                click.echo(message)
                poobrains.app.logger.error(message)
                continue

            source_title = headline_dom.text.strip()
            source_name = poobrains.helpers.clean_string(source_title)

            try:
                Source.get(Source.name == source_name)
            except Source.DoesNotExist:
                pass
            else:
                click.echo("Already have a source named %s. Probably indicates duplicate names. Current URL %s" % (source_name, url))
                continue

            date_dom = dom.find(attrs={'class': 'story-date'})
            text_dom = dom.find(attrs={'class': 'story-text'})
            if date_dom is None or text_dom is None:
                message = "Couldn't extract date or text for %s" % url
                click.echo(message)
                poobrains.app.logger.error(message)
                continue

            date_string = date_dom.text.strip().replace(u'\u2013', '-') # \u2013 is a unicode dash
            try:
                source_date = datetime.datetime.strptime(date_string, "%d.%m.%Y - %H:%M") # format string contains a *dash*, not a minus!
            except ValueError:
                message = "Couldn't parse date '%s' for %s" % (date_string, url)
                click.echo(message)
                poobrains.app.logger.error(message)
                continue

            source = Source()
            source.link = source_link
            source.type = "scrape_blaulicht"
            source.author = orgauthor
            source.title = source_title
            source.name = source_name
            # Neutralize angle brackets so scraped text can't inject markup.
            source.description = text_dom.text.strip().replace('<', '&lt;').replace('>', '&gt;')
            source.date = source_date
            source.owner = owner
            source.save()

            click.echo("Saved source: %s" % url)
2017-07-09 01:27:34 +00:00
2017-03-12 22:43:46 +00:00
# When executed as a script, hand control to the application's command line interface.
if __name__ == '__main__':
    app.cli()