2017-03-12 22:43:46 +00:00
#! /usr/bin/env python
# -*- coding: utf-8 -*-
2017-06-27 15:20:16 +00:00
import math
2017-07-09 01:27:34 +00:00
import string
2017-04-18 03:58:01 +00:00
import os
2017-03-13 00:59:42 +00:00
import random
2017-03-19 20:32:47 +00:00
import datetime
2017-03-17 02:25:55 +00:00
import requests
2017-03-16 22:55:59 +00:00
import bs4
2017-04-18 03:58:01 +00:00
import markdown
2017-07-09 21:32:25 +00:00
import click
2017-03-17 02:25:55 +00:00
import flask
2017-03-12 22:43:46 +00:00
import poobrains
2017-05-25 05:46:22 +00:00
from io import BytesIO
2017-05-25 05:59:03 +00:00
from PIL import Image , ImageDraw , ImageFont , ImageSequence
2017-04-18 03:58:01 +00:00
2017-03-12 22:43:46 +00:00
# Module-level shorthand for the poobrains application object; the route,
# expose, box and after_request decorators in this file are taken from it.
app = poobrains . app
2018-04-06 04:06:53 +00:00
@app.route(' / ')
def front():
    # Front page: redirect to the full view of one specific user, and fall
    # back to the article listing when anything about that lookup fails.
    try:
        profile = poobrains.auth.User.load(' phryk ')
        target = profile.url(' full ')
        return poobrains.redirect(target)
    except Exception:
        fallback = Article.url()
        return poobrains.redirect(fallback)
2017-05-26 15:59:15 +00:00
class MemePattern(markdown.inlinepatterns.Pattern):
    """Inline markdown pattern that renders a matched meme tag as an image.

    Each instance is bound to one meme ``name``. When the pattern matches,
    the caption is taken from match group 2, recorded in ``MemeWhiteList``
    if it is not stored there yet, and an ``img`` element pointing at the
    meme URL for (name, caption) is returned.
    """

    name = None

    def __init__(self, pattern, name, md=None):
        super().__init__(pattern, md=md)
        self.name = name

    def handleMatch(self, match):
        # Without a match, defer to the base class implementation.
        if not match:
            return super().handleMatch(match)

        caption = match.group(2)

        # Remember the caption so the image view will accept it later.
        known = MemeWhiteList.select().where(MemeWhiteList.caption == caption).count()
        if not known:
            entry = MemeWhiteList()
            entry.caption = caption
            entry.save()

        image = markdown.util.etree.Element(' img ')
        image.set(' src ', " /meme/ %s / %s " % (self.name, caption))
        image.set(' alt ', caption)
        return image
2017-04-18 03:58:01 +00:00
class Memextension(markdown.Extension):
    """Markdown extension that registers one MemePattern per configured meme."""

    def extendMarkdown(self, md, md_globals):
        # Nothing to register when the application config has no meme table.
        if ' MEMES ' not in app.config:
            return

        for name in app.config[' MEMES ']:
            regex = ' < %s >(.*?)</ %s > ' % (name, name)
            pattern = MemePattern(regex, name)
            md.inlinePatterns.add(name, pattern, ' <reference ')
2017-04-18 03:58:01 +00:00
# Install the meme extension on the markdown object exposed as poobrains.md.md
# (second argument: an empty list, i.e. no extra extension configs are passed).
poobrains . md . md . registerExtensions ( [ Memextension ( ) ] , [ ] )
2017-10-21 00:25:40 +00:00
2017-05-26 15:59:15 +00:00
@app.expose(' /meme/<string:name>/<string:caption> ')
class Mememage(poobrains.auth.Protected):
    # Renders a meme image on request: a configured template image with the
    # caption drawn onto it as outlined text.

    def view(self, mode=' full ', name=None, caption=None):
        # Parameters:
        #   mode    -- accepted but not used by this method.
        #   name    -- key into app.config[' MEMES '] selecting the template file.
        #   caption -- text to draw; must already be stored in MemeWhiteList.
        # Returns a flask.Response holding GIF or PNG data, publicly cacheable.
        # Raises poobrains.auth.AccessDenied for unknown names or captions that
        # are not whitelisted.

        # The database is only queried when the meme name is known.
        allowed = (
            name in app.config[' MEMES ']
            and MemeWhiteList.select().where(MemeWhiteList.caption == caption).count()
        )
        if not allowed:
            raise poobrains.auth.AccessDenied()

        # TODO: is input sanitation still needed?
        # Exactly one separator splits the caption into an upper and a lower
        # line; anything else is drawn as a single lower line.
        parts = caption.split(' : ')
        if len(parts) == 2:
            upper, lower = parts
        else:
            upper, lower = None, caption

        filename = os.path.join(app.root_path, app.config[' MEMES '][name])
        extension = filename.split(' . ')[-1]

        # The context manager closes the template file once rendering is done;
        # previously the handle was left open.
        with Image.open(filename) as template:
            font = ImageFont.truetype(os.path.join(app.root_path, ' LeagueGothic-Regular.otf '), 80)

            # Scale to a fixed width of 750 pixels, keeping the aspect ratio.
            resized = (750, int(template.height * (750.0 / template.width)))

            # NOTE(review): ImageFont.getsize is not available in recent Pillow
            # releases - confirm the Pillow version this deployment pins.
            captions = []  # (text, x, y) for every line that gets drawn
            if upper:
                upper_size = font.getsize(upper)
                upper_x = int(round(resized[0] / 2.0 - upper_size[0] / 2.0))
                captions.append((upper, upper_x, 0))
            if lower:
                lower_size = font.getsize(lower)
                lower_x = int(round(resized[0] / 2.0 - lower_size[0] / 2.0))
                lower_y = resized[1] - lower_size[1] - 10  # last int is margin from bottom
                captions.append((lower, lower_x, lower_y))

            def render(frame):
                # Resize one frame and composite the caption text over it.
                base = frame.convert(' RGBA ').resize(resized, Image.BICUBIC)
                text_layer = Image.new(' RGBA ', base.size, (0, 0, 0, 0))
                text_draw = ImageDraw.Draw(text_layer)
                for text, x, y in captions:
                    outlined_text(text_draw, text, x, y, font=font)
                return Image.alpha_composite(base, text_layer)

            out = BytesIO()
            if extension == ' gif ':
                # Caption every frame, then save them as one animation with
                # the template's timing and loop settings carried over.
                frames = [render(frame) for frame in ImageSequence.Iterator(template)]
                meme = frames.pop(0).convert(' P ')
                for key in (' duration ', ' loop '):
                    if key in template.info:
                        meme.info[key] = template.info[key]
                meme.save(out, save_all=True, append_images=frames, format=' GIF ')
                mimetype = ' image/gif '
            else:
                render(template).save(out, format=' PNG ')
                mimetype = ' image/png '

        r = flask.Response(out.getvalue(), mimetype=mimetype)
        r.cache_control.public = True
        r.cache_control.max_age = 604800  # seconds (7 days)
        return r
2017-03-13 00:59:42 +00:00
2017-05-25 05:46:22 +00:00
def outlined_text(drawing, text, x=0, y=0, font=None):
    """Draw white text with a one pixel black outline.

    The text is first drawn in opaque black at the four diagonal one-pixel
    offsets around (x, y), then in opaque white at (x, y) itself, using
    ``drawing.text``.
    """
    black = (0, 0, 0, 255)
    white = (255, 255, 255, 255)

    for dx, dy in ((-1, -1), (-1, 1), (1, 1), (1, -1)):
        drawing.text((x + dx, y + dy), text, font=font, fill=black)

    drawing.text((x, y), text, font=font, fill=white)
2017-03-13 00:59:42 +00:00
# content types
2017-05-26 15:59:15 +00:00
class MemeWhiteList(poobrains.storage.Model):
    # Stores the meme captions that were seen while rendering markdown; the
    # meme image view only accepts captions found here.
    # TODO: add functionality to remove references to deleted/removed storable instances

    caption = poobrains.storage.fields.CharField()
2017-03-17 02:25:55 +00:00
class ScoredLink(poobrains.auth.Administerable):
    # A link together with a count of the resources on the linked page that
    # are loaded from other sites. The count is refreshed on every save().

    class Meta:
        form_blacklist = [' id ', ' external_site_count ', ' updated ']

    link = poobrains.storage.fields.CharField(unique=True)  # TODO: Add an URLField to poobrains.
    external_site_count = poobrains.storage.fields.IntegerField(null=True)
    updated = poobrains.storage.fields.DateTimeField(null=False, default=datetime.datetime.now)

    def scrape_external_site_count(self):
        """Fetch the linked page and count its off-site resource references.

        Looks at script, link, img and object elements and counts those whose
        URL attribute is absolute and whose host part is neither the host of
        this link nor a subdomain of it. Returns 0 when no link is set.
        Network and parsing errors propagate to the caller.
        """
        external_site_count = 0

        if self.link:
            # Third slash-separated piece of the URL is used as the host.
            link_domain = self.link.split(' / ')[2]

            html = requests.get(self.link, timeout=30).text
            dom = bs4.BeautifulSoup(html, ' lxml ')

            # tag name -> attribute holding the resource URL
            scored_elements = {
                ' script ': ' src ',
                ' link ': ' href ',
                ' img ': ' src ',
                ' object ': ' data ',
            }

            for tag, attribute in scored_elements.items():
                for element in dom.find_all(tag):
                    attribute_value = element.get(attribute)
                    # means this isn't a relative link
                    if isinstance(attribute_value, str) and ' :// ' in attribute_value:
                        attribute_domain = attribute_value.split(' / ')[2]
                        # whether attribute_domain is a subdomain of link_domain
                        is_subdomain = attribute_domain.endswith(' . %s ' % link_domain)
                        if attribute_domain != link_domain and not is_subdomain:
                            external_site_count += 1

        return external_site_count

    def save(self, *args, **kwargs):
        """Refresh the score (best effort), then save as usual.

        A failed scrape is logged and otherwise ignored, so the stored count
        and ``updated`` timestamp keep their previous values in that case.
        """
        try:
            self.external_site_count = self.scrape_external_site_count()
            self.updated = datetime.datetime.now()
        except Exception as e:  # Match all errors so failures here don't interfere with normal operations
            poobrains.app.logger.error(' Could not scrape external site count for URL: %s ' % self.link)
            poobrains.app.logger.debug(' Problem when scraping external site count: %s : %s ' % (str(type(e)), str(e)))

        return super(ScoredLink, self).save(*args, **kwargs)

    @property
    def set_size(self):
        """Number of stored rows of this class."""
        return self.__class__.select().count()

    @property
    def external_site_counts(self):
        """Ascending list of all stored, non-null external site counts.

        Runs a database query on every access.
        """
        cls = self.__class__
        query = (
            cls.select(cls.external_site_count)
            .where(cls.external_site_count != None)  # noqa: E711 - builds a query expression
            .order_by(cls.external_site_count)
            .dicts()
        )
        return [row[' external_site_count '] for row in query]

    @property
    def median(self):
        """Median of all stored counts as a float, or None when there are none."""
        counts = self.external_site_counts  # fetched once instead of per use
        if not counts:
            return None

        middle = len(counts) // 2
        if len(counts) % 2 == 0:
            return (counts[middle - 1] + counts[middle]) / 2.0
        return float(counts[middle])

    @property
    def mean(self):
        """Arithmetic mean of all stored counts, or None when there are none."""
        counts = self.external_site_counts
        if not counts:
            return None
        return sum(counts) / float(len(counts))

    @property
    def name(self):
        """The link itself serves as this record's name."""
        return self.link
2017-03-17 02:25:55 +00:00
@app.expose(' /source/organization/ ', mode=' full ')
class SourceOrganization(poobrains.commenting.Commentable):
    # An organization that sources can be attributed to.
    #   parent      -- optional reference to another SourceOrganization
    #   title       -- display title
    #   link        -- optional ScoredLink for the organization
    #   description -- optional markdown text

    parent = poobrains.storage.fields.ForeignKeyField(' self ', null=True)
    title = poobrains.storage.fields.CharField()
    link = poobrains.storage.fields.ForeignKeyField(ScoredLink, null=True)
    description = poobrains.md.MarkdownField(null=True)
2017-03-12 22:43:46 +00:00
2017-03-17 02:25:55 +00:00
@app.expose(' /source/author/ ', mode=' full ')
class SourceAuthor(poobrains.commenting.Commentable):
    # An author that sources can be attributed to.
    #   title       -- display title
    #   link        -- optional ScoredLink for the author
    #   description -- optional markdown text

    title = poobrains.storage.fields.CharField()
    link = poobrains.storage.fields.ForeignKeyField(ScoredLink, null=True)
    description = poobrains.md.MarkdownField(null=True)
2018-04-06 05:29:31 +00:00
@app.expose(' /source/organizationauthor/ ', mode=' full ')
class SourceOrganizationAuthor(poobrains.commenting.Commentable):
    # Pairs one SourceOrganization with one SourceAuthor; Source.author
    # references rows of this class.

    organization = poobrains.storage.fields.ForeignKeyField(SourceOrganization)
    author = poobrains.storage.fields.ForeignKeyField(SourceAuthor)
2017-03-12 22:43:46 +00:00
2017-03-17 02:25:55 +00:00
@app.expose(' /source/ ', mode=' full ')
class Source(poobrains.commenting.Commentable):
    # A single source document.
    #   title       -- display title
    #   type        -- free-form type marker (see TODO on the field)
    #   author      -- the organization/author pair the source is attributed to
    #   link        -- optional ScoredLink to the source
    #   description -- markdown text

    title = poobrains.storage.fields.CharField()
    type = poobrains.storage.fields.CharField()  # TODO: We need some logic to make this useful. Also, build enum type compatible to sqlite+postgres?
    author = poobrains.storage.fields.ForeignKeyField(SourceOrganizationAuthor)
    link = poobrains.storage.fields.ForeignKeyField(ScoredLink, null=True)
    description = poobrains.md.MarkdownField()
2017-03-12 22:43:46 +00:00
2017-03-17 02:25:55 +00:00
@app.expose(' /article/ ', mode=' full ')
class Article(poobrains.commenting.Commentable):
    # An article: a title plus a markdown body.

    title = poobrains.storage.fields.CharField()
    text = poobrains.md.MarkdownField()
@app.expose(' /projects/ ', mode=' full ')
class Project(poobrains.commenting.Commentable):
    # A project: title, markdown body and a link stored as plain text
    # (a CharField, not a ScoredLink reference).

    title = poobrains.storage.fields.CharField()
    text = poobrains.md.MarkdownField()
    link = poobrains.storage.fields.CharField()
2017-03-12 22:43:46 +00:00
2017-03-17 02:25:55 +00:00
@app.expose(' /curated/ ', mode=' full ')
class CuratedContent(poobrains.commenting.Commentable):
    # A curated piece of content: title, markdown description and an
    # optional ScoredLink to it.

    title = poobrains.storage.fields.CharField()
    description = poobrains.md.MarkdownField()
    link = poobrains.storage.fields.ForeignKeyField(ScoredLink, null=True)
2017-03-16 16:56:17 +00:00
@app.site.box(' menu_main ')
def menu_main():
    # Builds the main menu: one entry per content type whose teaser listing
    # URL can be produced, followed by the entries supplied by
    # poobrains.auth.Page.main_menu_entries(). Returns the Menu object.
    menu = poobrains.rendering.Menu(' main ')

    sections = (
        (Article, ' Articles '),
        (Project, ' Projects '),
        (CuratedContent, ' Curated content '),
        (Source, ' Sources '),
    )

    for content_type, label in sections:
        try:
            menu.append(content_type.url(' teaser '), label)
        except poobrains.auth.AccessDenied:
            # Deliberate: a section that raises AccessDenied is simply left
            # out of the menu.
            pass

    for url, caption in poobrains.auth.Page.main_menu_entries():
        menu.append(url, caption)

    return menu
2017-03-12 22:43:46 +00:00
2017-03-13 00:59:42 +00:00
# Word pools for the X-Doge response header: one word is drawn from the
# prefix pool, one from the thing pool (extended by thing_tls on secure
# requests) and one from the suffix pool.
DOGE = {
    ' prefix ': [
        ' wow ',
        ' such ',
        ' many ',
        ' more ',
        ' so ',
        ' lol ',
        ' very ',
        ' omg ',
    ],
    ' thing ': [
        ' wow ',
        ' doge ',
        ' shibe ',
        ' 1337 h4xx0rz ',
        ' internet ',
        ' pretty ',
        ' computer ',
        ' free software ',
        ' website ',
        ' content ',
        ' python ',
        ' flask ',
        ' poobrains ',
        ' NOT PHP ',
    ],
    ' thing_tls ': [
        ' transport layer security ',
        ' X.509 ',
        ' certificate ',
    ],
    ' suffix ': [
        ' wow ',
        ' pls ',
        ' mystery ',
        ' anarchy ',
        ' bees ',
    ],
}
@app.after_request
def mkdoge(response):
    # Adds an X-Doge header made of one random prefix, thing and suffix to
    # every response. The TLS-themed things are only eligible when the
    # request is secure.
    things = DOGE[' thing ']
    if flask.request.is_secure:
        things = things + DOGE[' thing_tls ']  # new list; DOGE is not modified

    pools = (DOGE[' prefix '], things, DOGE[' suffix '])
    response.headers[' X-Doge '] = ' '.join(random.choice(pool) for pool in pools)
    return response
2017-07-09 01:27:34 +00:00
## ##
## "Waffenfunde" (weapon finds) info scraping ##
## ##
2017-07-09 21:32:25 +00:00
# Search terms that scrape_blaulicht substitutes into the presseportal.de
# search URL, one crawl per term (German: "weapon find", "weapon found",
# "weapons found").
MONITOR_PATTERNS = [ ' waffenfund ' , ' waffe gefunden ' , ' waffen gefunden ' ]
2017-07-09 01:27:34 +00:00
2018-01-04 15:22:15 +00:00
@poobrains.app.cron
def scrape_linkscores():
    # Re-saves every ScoredLink whose score is older than seven days (saving
    # a ScoredLink re-scrapes its score) and reports how many were touched.

    def show_link(x=None):
        # Progress bar caption: the link currently being processed.
        return x.link if x else ' '

    cutoff = datetime.datetime.now() - datetime.timedelta(days=7)
    count = 0  # keep track of how many scores we actually update

    progress = click.progressbar(
        ScoredLink.select(),
        label=" Update link scores where necessary ",
        item_show_func=show_link,
    )
    with progress as links:  # iterates through all stored ScoredLink rows
        for link in links:
            if link.updated < cutoff:  # update at most once per period
                link.save()  # ScoredLink scores are updated on every .save
                count += 1

    click.secho(f" Updated { count } link scores. ", fg=' green ')
2018-01-04 15:22:15 +00:00
2022-05-21 19:51:22 +00:00
#@poobrains.app.cron # bitrotted inactive research, disable
2017-07-09 01:27:34 +00:00
# Crawls the presseportal.de "blaulicht" search for every term in
# MONITOR_PATTERNS and stores each article found as a Source, creating the
# ScoredLink, SourceOrganization, SourceAuthor and SourceOrganizationAuthor
# rows it refers to when they do not exist yet. Takes no arguments and
# returns nothing; progress is reported through click.echo and failures are
# also written to the application log.
# NOTE(review): indentation is missing in this copy of the file, so the exact
# nesting of the statements below could not be confirmed - verify against the
# repository before changing any logic here.
def scrape_blaulicht ( ) :
# Every record created below is assigned to the user with id 2.
owner = poobrains . auth . User . get ( poobrains . auth . User . id == 2 )
for pattern in MONITOR_PATTERNS :
article_urls = [ ]
2017-07-09 21:32:25 +00:00
html = requests . get ( ' http://www.presseportal.de/blaulicht/suche.htx?q= %s ' % pattern , timeout = 30 ) . text
2018-04-05 20:53:17 +00:00
dom = bs4 . BeautifulSoup ( html , ' lxml ' )
2017-07-09 01:27:34 +00:00
2017-07-09 21:32:25 +00:00
click . echo ( " Beginning crawl of pagination for search pattern ' %s ' . " % pattern )
2017-07-09 01:27:34 +00:00
# Phase 1: walk the result pages and collect the article URLs; a page
# without a 'pagination-next' element is treated as the last one.
last_page = False
while not last_page :
next_page = dom . find ( attrs = { ' class ' : ' pagination-next ' } )
if next_page == None :
last_page = True
else :
# The next-page URL is read from the element's data-url attribute (the site uses spans instead of real links here).
next_page_url = ' http://www.presseportal.de/blaulicht/ %s ' % next_page [ ' data-url ' ]
for article in dom . find_all ( ' article ' ) :
2018-04-05 20:53:17 +00:00
try :
article_urls . append ( article . find ( ' h2 ' , attrs = { ' class ' : ' news-headline ' } ) . a [ ' href ' ] )
except Exception as e :
click . echo ( " Article appears to be without headline link, skipping " )
2017-07-09 01:27:34 +00:00
if not last_page :
2017-07-09 21:32:25 +00:00
click . echo ( " Next page: %s " % next_page_url )
2018-04-05 20:53:17 +00:00
dom = bs4 . BeautifulSoup ( requests . get ( next_page_url , timeout = 30 ) . text , ' lxml ' )
2017-07-09 01:27:34 +00:00
2017-07-09 21:32:25 +00:00
click . echo ( " URL collection done, found %d articles. " % len ( article_urls ) )
2017-07-09 01:27:34 +00:00
2017-07-09 21:32:25 +00:00
click . echo ( " Beginning crawl of individual articles. " )
2017-07-09 01:27:34 +00:00
# Phase 2: fetch each collected article and store it.
for article_url in article_urls :
2017-07-09 21:32:25 +00:00
url = ' http://www.presseportal.de %s ' % article_url
# Skip articles whose link already belongs to a stored Source.
try :
testlink = ScoredLink . get ( ScoredLink . link == url )
if Source . select ( ) . where ( Source . link == testlink ) . count ( ) :
click . echo ( " Already know source with link %s , skipping. " % url )
continue
except ScoredLink . DoesNotExist :
pass
# Connection errors and read timeouts are logged and the article is skipped.
try :
2018-04-05 20:53:17 +00:00
dom = bs4 . BeautifulSoup ( requests . get ( url , timeout = 30 ) . text , ' lxml ' )
2017-07-10 21:01:13 +00:00
except ( requests . exceptions . ConnectionError , requests . exceptions . ReadTimeout ) as e :
2019-02-10 14:56:21 +00:00
message = ' %s for %s : %s ' % ( type ( e ) . __name__ , url , str ( e ) )
2017-07-09 21:32:25 +00:00
click . echo ( message )
poobrains . app . logger . error ( message )
continue
2017-07-09 01:27:34 +00:00
2017-07-09 14:26:57 +00:00
# The organization is taken from the link inside the 'story-company' heading.
try :
org_dom = dom . find ( ' h2 ' , attrs = { ' class ' : ' story-company ' } ) . a
except Exception as e :
2017-07-09 21:32:25 +00:00
message = " Couldn ' t extract source organization for %s " % url
click . echo ( message )
poobrains . app . logger . error ( message )
2017-07-09 14:26:57 +00:00
continue
2017-07-09 01:27:34 +00:00
# Look up the organization by title, creating it (and its link) when missing.
try :
org = SourceOrganization . get ( SourceOrganization . title == org_dom . text )
except SourceOrganization . DoesNotExist :
2017-07-10 21:01:13 +00:00
org_url = ' http://www.presseportal.de %s ' % org_dom [ ' href ' ]
try :
org_link = ScoredLink . get ( ScoredLink . link == org_url )
except ScoredLink . DoesNotExist :
org_link = ScoredLink ( )
org_link . link = org_url
org_link . save ( )
2017-07-09 01:27:34 +00:00
org = SourceOrganization ( )
org . name = poobrains . helpers . clean_string ( org_dom . text )
org . title = org_dom . text
org . link = org_link
org . owner = owner
org . save ( )
# The author record mirrors the organization (same name, title and link).
try :
author = SourceAuthor . get ( SourceAuthor . name == org . name )
except SourceAuthor . DoesNotExist :
author = SourceAuthor ( )
author . name = org . name
author . title = org . title
author . link = org . link
author . owner = owner
author . save ( )
try :
orgauthor = SourceOrganizationAuthor . get ( SourceOrganizationAuthor . organization == org , SourceOrganizationAuthor . author == author )
except SourceOrganizationAuthor . DoesNotExist :
orgauthor = SourceOrganizationAuthor ( )
2018-04-06 05:29:31 +00:00
orgauthor . name = ' %s - %s ' % ( org . name , author . name )
2017-07-09 01:27:34 +00:00
orgauthor . organization = org
orgauthor . author = author
2018-04-06 05:29:31 +00:00
orgauthor . owner = owner
2017-07-09 01:27:34 +00:00
orgauthor . save ( )
try :
source_link = ScoredLink . get ( ScoredLink . link == url )
except ScoredLink . DoesNotExist :
source_link = ScoredLink ( )
source_link . link = url
source_link . save ( )
2017-07-09 21:32:25 +00:00
source_title = dom . find ( ' h1 ' , attrs = { ' class ' : ' story-headline ' } ) . text . strip ( )
2017-07-09 14:26:57 +00:00
source_name = poobrains . helpers . clean_string ( source_title )
2017-07-09 01:27:34 +00:00
# A Source is only created when none with the same cleaned name exists yet.
try :
2017-07-09 14:26:57 +00:00
source = Source . get ( Source . name == source_name )
2017-07-10 17:26:41 +00:00
click . echo ( " Already have a source named %s . Probably indicates duplicate names. Current URL %s " % ( source_name , url ) )
2017-07-09 01:27:34 +00:00
except Source . DoesNotExist :
2017-07-10 23:35:30 +00:00
# In debug mode this stops in the application's debugger.
if poobrains . app . debug :
poobrains . app . debugger . set_trace ( )
date_string = dom . find ( attrs = { ' class ' : ' story-date ' } ) . text . strip ( ) . replace ( u ' \u2013 ' , ' - ' ) # \u2013 is a unicode dash
2017-07-10 21:01:13 +00:00
2017-07-09 01:27:34 +00:00
source = Source ( )
source . link = source_link
source . type = " scrape_blaulicht "
source . author = orgauthor
2017-07-09 14:26:57 +00:00
source . title = source_title
source . name = source_name
2017-07-10 23:35:30 +00:00
# NOTE(review): as written, both replace() calls below substitute a string with itself;
# presumably they once escaped angle brackets as HTML entities - verify against the repository.
source . description = dom . find ( attrs = { ' class ' : ' story-text ' } ) . text . strip ( ) . replace ( ' < ' , ' < ' ) . replace ( ' > ' , ' > ' )
source . date = datetime . datetime . strptime ( date_string , " %d . % m. % Y - % H: % M " ) # format string contains a *dash*, not a minus!
2017-07-09 01:27:34 +00:00
source . owner = owner
source . save ( )
2017-07-09 21:32:25 +00:00
click . echo ( " Saved source: %s " % url )
2017-07-09 01:27:34 +00:00
2017-03-12 22:43:46 +00:00
# When the module is executed directly, hand control to the application's
# command line interface.
if __name__ == ' __main__ ' :
2017-07-08 17:26:41 +00:00
app . cli ( )