| 1 | #␊ |
| 2 | # feed2omb - a tool for publishing atom/rss feeds to microblogging services␊ |
| 3 | # Copyright (C) 2008-2012, Ciaran Gultnieks␊ |
| 4 | #␊ |
| 5 | # Version 0.9.2␊ |
| 6 | #␊ |
| 7 | # This program is free software: you can redistribute it and/or modify␊ |
| 8 | # it under the terms of the GNU Affero General Public License as published by␊ |
| 9 | # the Free Software Foundation, either version 3 of the License, or␊ |
| 10 | # (at your option) any later version.␊ |
| 11 | #␊ |
| 12 | # This program is distributed in the hope that it will be useful,␊ |
| 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of␊ |
| 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the␊ |
| 15 | # GNU Affero General Public License for more details.␊ |
| 16 | #␊ |
| 17 | # You should have received a copy of the GNU Affero General Public License␊ |
| 18 | # along with this program. If not, see <http://www.gnu.org/licenses/>.␊ |
| 19 | ␊ |
| 20 | import sys␊ |
| 21 | import os␊ |
| 22 | ␊ |
| 23 | sys.path.append(os.path.join(sys.path[0], 'extlib/feedparser'))␊ |
| 24 | import feedparser␊ |
| 25 | ␊ |
| 26 | sys.path.append(os.path.join(sys.path[0], 'extlib/configobj'))␊ |
| 27 | from configobj import ConfigObj␊ |
| 28 | ␊ |
| 29 | import urllib2␊ |
| 30 | import re␊ |
| 31 | from datetime import datetime␊ |
| 32 | import time␊ |
| 33 | from urllib import urlencode␊ |
| 34 | from optparse import OptionParser␊ |
| 35 | ␊ |
#Suppressing all warnings, just to get rid of all the deprecation warnings
#that are spewed out by xmpppy...
| 38 | import warnings␊ |
| 39 | warnings.simplefilter("ignore")␊ |
| 40 | ␊ |
| 41 | ␊ |
| 42 | #Get the author name for a particular entry␊ |
def getauthor(entry):
    """Return the author name for a feed entry as a string.

    The entry is expected to support both ``key in entry`` tests and
    attribute access (as feedparser entries do). Candidates are tried in
    this order:

    1. ``entry.source.author_detail.name``
    2. ``entry.author_detail.name``
    3. ``entry.author``

    An empty string is returned when none of them is present. A string is
    always returned, because callers concatenate the result directly into
    the message text.
    """
    if 'source' in entry:
        origin = entry.source
        if 'author_detail' in origin and 'name' in origin.author_detail:
            return origin.author_detail.name
    #An author_detail without a name used to be returned as-is, handing the
    #caller a dict-like object instead of text. Fall through to the plain
    #author field instead.
    if 'author_detail' in entry and 'name' in entry.author_detail:
        return entry.author_detail.name
    if 'author' in entry:
        return entry.author
    return ""
| 54 | ␊ |
| 55 | #URL shorteners - each of these takes a URL and returns the␊ |
| 56 | #shortened version, along with the 'length' of the shortened␊ |
| 57 | #version. The length is quoted, and returned, because where␊ |
| 58 | #we allow the target OMB site to shorten for us, we don't␊ |
| 59 | #return the actual length here, but an assumed one.␊ |
| 60 | #The second parameter is the host to use for shortening, which␊ |
| 61 | #is relevant only for shortening types that require it.␊ |
| 62 | ␊ |
| 63 | ␊ |
def shorten_bitly(url, host):
    """Shorten url using the bit.ly v3 API.

    host is unused; it is accepted so that all shorteners share the same
    signature. The API key and login are read from the module-level
    'config' ('urlshortenkey' and 'urlshortenlogin').

    Returns a (shorturl, length) tuple. If anything goes wrong the
    placeholder '<no link>' is returned in place of a short URL.
    """
    try:
        #Build the query with urlencode so that characters such as '&' or
        #'#' in the long URL cannot corrupt the request.
        query = urlencode([('format', 'txt'),
                           ('longUrl', url),
                           ('apiKey', config['urlshortenkey']),
                           ('login', config['urlshortenlogin'])])
        biturl = 'http://api.bitly.com/v3/shorten?' + query
        #Log the long URL only - the request URL contains the API key.
        print('Requesting short URL from bit.ly for "' + url + '"')
        response = urllib2.urlopen(biturl)
        try:
            #The plain text response ends with a newline, which must not
            #end up in the message or be counted in the length.
            shorturl = response.read().strip()
        finally:
            response.close()
    except Exception:
        #Sometimes, bit.ly seems to refuse to give a result for
        #a seemingly innocuous URL - this is a fallback for that
        #scenario...
        print('Failed to get short URL')
        shorturl = '<no link>'
    return (shorturl, len(shorturl))
| 79 | ␊ |
| 80 | ␊ |
def shorten_jmp(url, host):
    """Shorten url using the j.mp API.

    host is unused; it is accepted so that all shorteners share the same
    signature.

    Returns a (shorturl, length) tuple. If anything goes wrong the
    placeholder '<no link>' is returned in place of a short URL.
    """
    try:
        #urlencode escapes the long URL so that its own query string
        #('&', '#', ...) is not mistaken for part of the API request.
        biturl = 'http://j.mp/api?' + urlencode({'url': url})
        print('Requesting short URL from "' + biturl + '"')
        response = urllib2.urlopen(biturl)
        try:
            #Strip surrounding whitespace so it is neither posted nor
            #counted in the length.
            shorturl = response.read().strip()
        finally:
            response.close()
    except Exception:
        #Sometimes, j.mp seems to refuse to give a result for
        #a seemingly innocuous URL - this is a fallback for that
        #scenario...
        print('Failed to get short URL')
        shorturl = '<no link>'
    return (shorturl, len(shorturl))
| 94 | ␊ |
| 95 | ␊ |
| 96 | ␊ |
def shorten_laconica(url, host):
    """Leave url untouched and report an assumed length for it.

    This is the shortener used when the target site is left to shorten
    the link itself, so the length returned is an assumed one rather than
    the real length of url. host is unused.
    """
    assumed_length = 22
    return (url, assumed_length)
| 99 | ␊ |
| 100 | ␊ |
def shorten_lilurl(url, host):
    """Shorten url using a lilurl-based service running at host.

    The long URL is POSTed to host as 'longurl', and the short URL is taken
    from the first href="..." attribute in the page that comes back.

    Returns a (shorturl, length) tuple. If the request fails or no link is
    found, the placeholder '<no link>' is returned in place of a short URL.
    Exits the program with status 1 if host is None.
    """
    #This check must stay outside the try block below: sys.exit() works by
    #raising SystemExit, which a catch-all handler would swallow, turning a
    #configuration error into a silent '<no link>'.
    if host is None:
        print("Configuration error - lilurl shortener requires a host")
        sys.exit(1)
    marker = 'href="'
    try:
        request = urllib2.Request(host, urlencode({'longurl': url}))
        response = urllib2.urlopen(request)
        try:
            page = response.read()
        finally:
            response.close()
        #It's a hack, but I don't want to get involved in "which parser,
        #which dom, make sure you have these dependencies installed" just
        #to pull a tiny bit of text out of a bigger bit of text, so...
        start = page.find(marker)
        if start == -1:
            raise ValueError("Link not found")
        start += len(marker)
        end = page.find('"', start)
        if end == -1:
            raise ValueError("Link not found")
        shorturl = page[start:end]
    except Exception:
        print('Failed to get short URL')
        shorturl = '<no link>'
    return (shorturl, len(shorturl))
| 124 | ␊ |
| 125 | ␊ |
def shorten_yourls(url, host):
    """Shorten url using a YOURLS installation at host.

    The long URL is POSTed to host + '/index.php' with action 'shorturl',
    and the short URL is pulled out of the 'Short URL:' link in the page
    that comes back.

    Returns a (shorturl, length) tuple. If the request fails or no link is
    found, the placeholder '<no link>' is returned in place of a short URL.
    Exits the program with status 1 if host is None.
    """
    #This check must stay outside the try block below: sys.exit() works by
    #raising SystemExit, which a catch-all handler would swallow, turning a
    #configuration error into a silent '<no link>'.
    if host is None:
        print("Configuration error - yourls shortener requires a host")
        sys.exit(1)
    marker = '<p>Short URL: <code><a href="'
    try:
        params = {'url': url, 'action': 'shorturl'}
        request = urllib2.Request(host + '/index.php', urlencode(params))
        response = urllib2.urlopen(request)
        try:
            page = response.read()
        finally:
            response.close()
        #It's a hack, but I don't want to get involved in "which parser,
        #which dom, make sure you have these dependencies installed" just
        #to pull a tiny bit of text out of a bigger bit of text, so...
        start = page.find(marker)
        if start == -1:
            raise ValueError("Link not found")
        start += len(marker)
        end = page.find('"', start)
        if end == -1:
            raise ValueError("Link not found")
        shorturl = page[start:end]
    except Exception:
        print('Failed to get short URL')
        shorturl = '<no link>'
    return (shorturl, len(shorturl))
| 149 | ␊ |
| 150 | ␊ |
def shorten_none(url, host):
    """Do no shortening: return url as it is, with its real length.

    host is unused; it is accepted so that all shorteners share the same
    signature.
    """
    unchanged = url
    return (unchanged, len(unchanged))
| 153 | ␊ |
| 154 | ␊ |
#Refuse to run on interpreters older than Python 2.4.
if not sys.version_info >= (2, 4):
    print("Python 2.4 or later is required.")
    sys.exit(1)
| 158 | ␊ |
| 159 | ␊ |
#Deal with the command line...
parser = OptionParser()
parser.add_option("-d", "--debug", action="store_true", default=False,
                  help="Print debugging info on standard output")
parser.add_option("-v", "--version", action="store_true", default=False,
                  help="Display version and exit")
parser.add_option("-u", "--update", action="store_true", default=False,
                  help="Update the feeds using the config files specified")
parser.add_option("-e", "--eat", action="store_true", default=False,
                  help="Eat items found - i.e. mark as sent, but do not send")
parser.add_option("-t", "--test", action="store_true", default=False,
                  help="Test only - display local output but do not post to " +
                       "omb or mark as sent")
parser.add_option("-m", "--max", type="int", default=-1,
                  help="Specify maximum number of items to process for each " +
                       "feed - overrides 'maxpost' in individual config " +
                       "files. Use 0 to post everything.")
(options, args) = parser.parse_args()

#Handle --version before any other validation. It used to be checked only
#after the --update/--eat and config file checks, so asking for the version
#on its own just produced an error.
if options.version:
    print("feed2omb version 0.9.2\nCopyright 2008-12 Ciaran Gultnieks")
    sys.exit(0)

#Exactly one of --update and --eat selects what to do with new entries.
if not (options.update or options.eat):
    print("Specify either --update or --eat to process feeds")
    sys.exit(1)
if options.update and options.eat:
    print("You can't specify both --update and --eat")
    sys.exit(1)

if not args:
    print("No config files specified - specify one or more config files "
          "to process")
    sys.exit(1)

#Redirect output to log file in current directory unless told otherwise.
#The original stdout is kept in 'savout' so it can be restored at the end.
savout = sys.stdout
if not (options.debug or options.eat or options.test):
    of = open('feed2omb.log', 'a')
    sys.stdout = of
| 200 | ␊ |
#Set user agent for the feed parser...
feedparser.USER_AGENT = "feed2omb/0.9.2 +http://projects.ciarang.com/p/feed2omb/"

#Maps each supported 'urlshortener' config value to its implementation.
shorteners = {'bit.ly': shorten_bitly,
              'j.mp': shorten_jmp,
              'lilurl': shorten_lilurl,
              'laconica': shorten_laconica,
              'yourls': shorten_yourls,
              'none': shorten_none}

#Process each config file given on the command line in turn.
#NOTE: this is deliberately left as flat module-level code rather than being
#wrapped in a function - shorten_bitly reads 'config' as a module global.
for thisconfig in args:

    print("Reading config: " + thisconfig)
    config = ConfigObj(thisconfig, file_error=True)

    print('Reading feed...')
    feed = feedparser.parse(config['feedurl'])

    #Number of entries handled so far for this feed
    done = 0

    #Determine message mode...
    msgmode = config.get('msgmode', 'title')

    #Determine maximum message length...
    maxlen = int(config.get('maxlen', 140))

    #Notice source
    source = config.get('source', 'feed2omb')

    #Determine maximum items to post for this feed - command-line --max
    #overrides the config file, and 0 (or less) means no limit...
    maxpost = int(config.get('maxpost', 2))
    thismax = options.max
    if options.max == -1:
        thismax = maxpost

    #Determine if we are including links with the message...
    includelinks = config.get('includelinks') != 'no'

    #Determine sent mode... (i.e. how we decide if we've already sent an entry)
    sentmode = config.get('sentmode', 'sentlinks')
    if sentmode == 'timestamp':
        if 'lastsent' in config:
            lastsent = datetime(*time.strptime(config['lastsent'],
                                               "%Y-%m-%d %H:%M:%S")[0:6])
        else:
            lastsent = datetime.min
    elif 'sentlinks' not in config:
        #A config that has never recorded a sent link has no [sentlinks]
        #section yet - create it rather than failing on the first lookup.
        config['sentlinks'] = {}

    #Determine url shortening mode...
    if 'urlshortener' in config:
        urlshortener = config['urlshortener']
        urlshortenhost = config.get('urlshortenhost')
    else:
        urlshortener = 'lilurl'
        urlshortenhost = 'http://ur1.ca'
    shortenalways = config.get('shortenalways') == 'yes'

    #Reject an unrecognised shortener up front with a readable message,
    #instead of a KeyError part way through the feed. Only relevant when
    #links are included, as the shortener is not used otherwise.
    if includelinks and urlshortener not in shorteners:
        print("Unknown urlshortener '" + urlshortener + "'")
        sys.exit(1)

    #If we've been told to use a lilurl-based shortening host, make sure
    #we've been told which one...
    if urlshortener == 'lilurl' and urlshortenhost is None:
        print("Host must be specified for lilurl-based shortener")
        sys.exit(1)

    #If we've been told to use bit.ly, make sure we have an API key...
    if urlshortener == 'bit.ly' and ('urlshortenkey' not in config or
                                     'urlshortenlogin' not in config):
        print("Login and API key must be specified for bit.ly")
        print("Option one - register, get details, put in config file")
        print("Option two - use a different shortener")
        sys.exit(1)

    #Determine hashtag mode...
    hashtags = config.get('hashtags', 'none')

    #See if we are going to apply one or more regular expressions to
    #the messages. When we're done, we'll have two lists, msgregex being
    #all the precompiled regular expressions, and msgreplace being their
    #corresponding replacement strings.
    msgregex = []
    msgreplace = []
    if 'messageregex' in config and 'messagereplace' in config:
        creg = config.as_list('messageregex')
        crep = config.as_list('messagereplace')
        if len(creg) != len(crep):
            print("You must give the same number of regular expressions "
                  "and replacements")
            sys.exit(1)
        msgregex = [re.compile(pattern) for pattern in creg]
        msgreplace = list(crep)

    #Finally we get to actually process the feed entries, in the reverse of
    #the order the feed lists them...
    for entry in reversed(feed.entries):

        #Decide if this is a new entry or one we've already sent...
        if sentmode == 'timestamp':
            thissent = datetime(*entry.updated_parsed[0:6])
            isnew = lastsent < thissent
        else:
            isnew = ("'" + entry.link + "'") not in config['sentlinks']
        if not isnew:
            continue

        print('Found new entry: ' + entry.link)

        #Shorten the URL...
        if includelinks:
            longurl = entry.link
            shorturl, urllen = shorteners[urlshortener](longurl,
                                                        urlshortenhost)
        else:
            urllen = 0

        #See how much space we have left once the URL is there:
        charsleft = maxlen
        if urllen > 0 and includelinks:
            #We will be adding " - " as well as the URL
            charsleft -= 3 + urllen

        #Build the message body according to the message mode. The summary
        #modes fall back to the title when the entry has no summary.
        if msgmode in ('summary', 'authsummary') and 'summary' in entry:
            text = entry.summary
        else:
            text = entry.title
        if msgmode in ('authtitle', 'authsummary'):
            text = getauthor(entry) + ' - ' + text

        #Apply regular expression search/replaces to the message body if
        #requested...
        for regex, replacement in zip(msgregex, msgreplace):
            text = regex.sub(replacement, text)

        #Truncate the message text if necessary. The slice end is clamped
        #at zero so that a very small remaining budget cannot turn into a
        #negative index (which would keep most of the text instead).
        if len(text) > charsleft:
            text = text[:max(charsleft - 3, 0)] + '...'

        #Append the url. Don't bother using the shortened one if the full
        #one fits...
        if includelinks:
            text += ' - '
            if not shortenalways and len(text + longurl) < maxlen:
                text += longurl
            else:
                text += shorturl

        #Add hashtags from categories if that mode is enabled...
        if hashtags == 'category' and 'categories' in entry:
            for cat in entry.categories:
                #Each category is a pair; only the second item is used
                (dontcare, cattxt) = cat
                cattxt = ' #' + cattxt
                if len(text + cattxt) < maxlen:
                    text += cattxt

        #Some console output to describe what's going on...
        if options.test:
            if options.eat:
                print('Eaten message would be:')
            else:
                print('Sent message would be:')
        else:
            if options.eat:
                print('Eating new message:')
            else:
                print('Sending new message:')
        if sys.stdout.encoding is not None:
            print(' ' + text.encode(sys.stdout.encoding, 'replace'))
        else:
            print(' <message hidden - output encoding cannot be '
                  'determined>')

        #Actually send the message, if that's what we're supposed to be
        #doing...
        if not options.test:
            if options.update:

                #OMB API send...
                if 'apibaseurl' in config and config['apibaseurl'] != "":
                    passwordmgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
                    passwordmgr.add_password(None, config['apibaseurl'],
                                             config['user'],
                                             config['password'])
                    handler = urllib2.HTTPBasicAuthHandler(passwordmgr)
                    opener = urllib2.build_opener(handler)
                    data = {'status': text.encode('utf-8'),
                            'source': source}
                    resp = opener.open(config['apibaseurl'] +
                                       '/statuses/update.xml',
                                       urlencode(data))
                    resp.close()

                #XMPP send...
                if 'xmpp_server' in config and config['xmpp_server'] != "":
                    #Imported here so that xmpp is only needed when XMPP
                    #sending is actually configured.
                    import xmpp
                    #Note that a new connection is made for each message
                    #currently!
                    jid = xmpp.protocol.JID(config['xmpp_jid'])
                    client = xmpp.Client(jid.getDomain(), debug=[])
                    client.connect()
                    client.auth(jid.getNode(), config['xmpp_password'],
                                resource="feed2omb")
                    client.send(xmpp.protocol.Message(config['xmpp_to'],
                                                      text))

            #Record that we have sent this entry (this applies to --eat as
            #well as --update, but not to --test)...
            if sentmode == 'timestamp':
                lastsent = thissent
                config['lastsent'] = lastsent.strftime("%Y-%m-%d %H:%M:%S")
            else:
                config['sentlinks']["'" + entry.link + "'"] = 'sent'

        #Rewrite the config after each link to avoid double-posting if
        #something goes wrong.
        if not options.test:
            config.write()

        #Keep track of how many items we've posted and stop if we reach the
        #requested limit
        done += 1
        if thismax > 0 and done >= thismax:
            print("Reached requested limit")
            break

    #NOTE(review): assumed to be reported once per config file rather than
    #once at the very end - confirm against the intended log format.
    print('Finished')

#Restore the original stdout, and close the log file if output had been
#redirected to one (it was previously left open).
logout = sys.stdout
sys.stdout = savout
if logout is not savout:
    logout.close()
| 459 | |