feed2omb

feed2omb Git Source Tree

Root/feed2omb.py

1#
2# feed2omb - a tool for publishing atom/rss feeds to microblogging services
3# Copyright (C) 2008-2012, Ciaran Gultnieks
4#
5# Version 0.9.2
6#
7# This program is free software: you can redistribute it and/or modify
8# it under the terms of the GNU Affero General Public License as published by
9# the Free Software Foundation, either version 3 of the License, or
10# (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU Affero General Public License for more details.
16#
17# You should have received a copy of the GNU Affero General Public License
18# along with this program. If not, see <http://www.gnu.org/licenses/>.
19
20import sys
21import os
22
23sys.path.append(os.path.join(sys.path[0], 'extlib/feedparser'))
24import feedparser
25
26sys.path.append(os.path.join(sys.path[0], 'extlib/configobj'))
27from configobj import ConfigObj
28
29import urllib2
30import re
31from datetime import datetime
32import time
33from urllib import urlencode
34from optparse import OptionParser
35
#Suppressing all warnings, just to get rid of all the deprecation warnings
#that are spewed out by xmpppy...
38import warnings
39warnings.simplefilter("ignore")
40
41
#Get the author name for a particular entry
def getauthor(entry):
    """Return the author name for a feed entry as a string.

    Sources are tried in this order:

    1. ``entry.source.author_detail.name``
    2. ``entry.author_detail.name``
    3. ``entry.author``

    An empty string is returned when none of them is available. The
    result is always the name itself, never the ``author_detail`` object,
    because callers concatenate it directly with other strings.
    """
    if 'source' in entry:
        source = entry.source
        if 'author_detail' in source and 'name' in source.author_detail:
            return source.author_detail.name
    #An author_detail without a name is of no use here - fall through to
    #the plain 'author' field rather than handing back the detail object.
    if 'author_detail' in entry and 'name' in entry.author_detail:
        return entry.author_detail.name
    if 'author' in entry:
        return entry.author
    return ""
54
55#URL shorteners - each of these takes a URL and returns the
56#shortened version, along with the 'length' of the shortened
57#version. The length is quoted, and returned, because where
58#we allow the target OMB site to shorten for us, we don't
59#return the actual length here, but an assumed one.
60#The second parameter is the host to use for shortening, which
61#is relevant only for shortening types that require it.
62
63
def shorten_bitly(url, host):
    """Shorten ``url`` using the bit.ly v3 API.

    ``host`` is unused; it is accepted so that all shorteners share the
    same signature. The API key and login are read from the module-level
    ``config`` ('urlshortenkey' and 'urlshortenlogin').

    Returns a ``(shorturl, length)`` tuple. If anything goes wrong the
    placeholder ``'<no link>'`` is returned instead of a URL.
    """
    try:
        #Build the query with urlencode so that '&', '?', '#' etc. in the
        #long URL cannot corrupt the request.
        query = urlencode({'format': 'txt',
                           'longUrl': url,
                           'apiKey': config['urlshortenkey'],
                           'login': config['urlshortenlogin']})
        #Log the URL being shortened, not the request URL - the latter
        #contains the API key, which should not end up in the log file.
        print('Requesting short URL from bit.ly for "' + url + '"')
        response = urllib2.urlopen('http://api.bitly.com/v3/shorten?' + query)
        try:
            #The text response ends with a newline, which must not become
            #part of the message or be counted in the length.
            shorturl = response.read().strip()
        finally:
            response.close()
        if not shorturl:
            raise Exception("Empty response")
    except Exception:
        #Sometimes, bit.ly seems to refuse to give a result for
        #a seemingly innocuous URL - this is a fallback for that
        #scenario...
        print('Failed to get short URL')
        shorturl = '<no link>'
    return (shorturl, len(shorturl))
79
80
def shorten_jmp(url, host):
    """Shorten ``url`` using the j.mp API.

    ``host`` is unused; it is accepted so that all shorteners share the
    same signature.

    Returns a ``(shorturl, length)`` tuple. If anything goes wrong the
    placeholder ``'<no link>'`` is returned instead of a URL.
    """
    try:
        #urlencode escapes the long URL so that '&', '?', '#' etc. in it
        #cannot corrupt the request.
        biturl = 'http://j.mp/api?' + urlencode({'url': url})
        print('Requesting short URL from "' + biturl + '"')
        response = urllib2.urlopen(biturl)
        try:
            #Drop any surrounding whitespace/newline so it is neither sent
            #nor counted in the length.
            shorturl = response.read().strip()
        finally:
            response.close()
        if not shorturl:
            raise Exception("Empty response")
    except Exception:
        #Sometimes, j.mp seems to refuse to give a result for
        #a seemingly innocuous URL - this is a fallback for that
        #scenario...
        print('Failed to get short URL')
        shorturl = '<no link>'
    return (shorturl, len(shorturl))
94
95
96
def shorten_laconica(url, host):
    """Leave shortening to the target site.

    The URL is returned unchanged, together with an assumed length of 22
    characters rather than its real length, since the target OMB site is
    expected to shorten it. ``host`` is unused.
    """
    return (url, 22)
99
100
def shorten_lilurl(url, host):
    """Shorten ``url`` using the lilurl-based service at ``host``.

    The long URL is POSTed to ``host`` as 'longurl' and the short URL is
    taken from the first ``href="..."`` in the response.

    Returns a ``(shorturl, length)`` tuple. If the request or the parsing
    fails, the placeholder ``'<no link>'`` is returned instead of a URL.
    Exits the program with status 1 if ``host`` is None.
    """
    #This check must stay outside the try block below, otherwise the
    #SystemExit raised by sys.exit() would be swallowed by the error
    #handler and the configuration error silently ignored.
    if host is None:
        print("Configuration error - lilurl shortener requires a host")
        sys.exit(1)
    try:
        req = urllib2.Request(host, urlencode({'longurl': url}))
        response = urllib2.urlopen(req)
        try:
            result = response.read()
        finally:
            response.close()
        #It's a hack, but I don't want to get involved in "which parser,
        #which dom, make sure you have these dependencies installed" just
        #to pull a tiny bit of text out of a bigger bit of text, so...
        marker = 'href="'
        index_start = result.find(marker)
        if index_start == -1:
            raise Exception("Link not found")
        index_start += len(marker)
        index_end = result.find('"', index_start)
        if index_end == -1:
            raise Exception("Link not found")
        shorturl = result[index_start:index_end]
    except Exception:
        print('Failed to get short URL')
        shorturl = '<no link>'
    return (shorturl, len(shorturl))
124
125
def shorten_yourls(url, host):
    """Shorten ``url`` using the YOURLS installation at ``host``.

    The long URL is POSTed to ``host + '/index.php'`` with the action
    'shorturl', and the short URL is taken from the "Short URL" link in
    the response.

    Returns a ``(shorturl, length)`` tuple. If the request or the parsing
    fails, the placeholder ``'<no link>'`` is returned instead of a URL.
    Exits the program with status 1 if ``host`` is None.
    """
    #This check must stay outside the try block below, otherwise the
    #SystemExit raised by sys.exit() would be swallowed by the error
    #handler and the configuration error silently ignored.
    if host is None:
        print("Configuration error - yourls shortener requires a host")
        sys.exit(1)
    try:
        data = urlencode({'url': url, 'action': 'shorturl'})
        req = urllib2.Request(host + '/index.php', data)
        response = urllib2.urlopen(req)
        try:
            result = response.read()
        finally:
            response.close()
        #It's a hack, but I don't want to get involved in "which parser,
        #which dom, make sure you have these dependencies installed" just
        #to pull a tiny bit of text out of a bigger bit of text, so...
        marker = '<p>Short URL: <code><a href="'
        index_start = result.find(marker)
        if index_start == -1:
            raise Exception("Link not found")
        index_start += len(marker)
        index_end = result.find('"', index_start)
        if index_end == -1:
            raise Exception("Link not found")
        shorturl = result[index_start:index_end]
    except Exception:
        print('Failed to get short URL')
        shorturl = '<no link>'
    return (shorturl, len(shorturl))
149
150
def shorten_none(url, host):
    """Do not shorten at all.

    Returns the URL unchanged together with its real length. ``host`` is
    unused; it is accepted so that all shorteners share the same signature.
    """
    return (url, len(url))
153
154
if sys.version_info < (2, 4):
    print("Python 2.4 or later is required.")
    sys.exit(1)


#Deal with the command line...
parser = OptionParser()
parser.add_option("-d", "--debug", action="store_true", default=False,
                  help="Print debugging info on standard output")
parser.add_option("-v", "--version", action="store_true", default=False,
                  help="Display version and exit")
parser.add_option("-u", "--update", action="store_true", default=False,
                  help="Update the feeds using the config files specified")
parser.add_option("-e", "--eat", action="store_true", default=False,
                  help="Eat items found - i.e. mark as sent, but do not send")
parser.add_option("-t", "--test", action="store_true", default=False,
                  help="Test only - display local output but do not post to " +
                  "omb or mark as sent")
parser.add_option("-m", "--max", type="int", default=-1,
                  help="Specify maximum number of items to process for each " +
                  "feed - overrides 'maxpost' in individual config " +
                  "files. Use 0 to post everything.")
(options, args) = parser.parse_args()

#Handle --version before validating anything else, so that it works on its
#own, without also requiring --update/--eat and a config file.
if options.version:
    print("feed2omb version 0.9.2\nCopyright 2008-12 Ciaran Gultnieks")
    sys.exit(0)

if not (options.update or options.eat):
    print("Specify either --update or --eat to process feeds")
    sys.exit(1)
if options.update and options.eat:
    print("You can't specify both --update and --eat")
    sys.exit(1)

if not args:
    print("No config files specified - specify one or more config files " +
          "to process")
    sys.exit(1)

#Redirect output to log file in current directory unless told otherwise.
#The original stdout is kept in savout so it can be restored at the end.
savout = sys.stdout
if not (options.debug or options.eat or options.test):
    of = open('feed2omb.log', 'a')
    sys.stdout = of

#Set user agent for the feed parser...
feedparser.USER_AGENT = "feed2omb/0.9.2 +http://projects.ciarang.com/p/feed2omb/"
203
#Map each supported 'urlshortener' config value to the function that
#implements it. Built once here rather than for every feed entry.
shorteners = {'bit.ly': shorten_bitly,
              'j.mp': shorten_jmp,
              'lilurl': shorten_lilurl,
              'laconica': shorten_laconica,
              'yourls': shorten_yourls,
              'none': shorten_none}

#Process each config file given on the command line in turn. For each one,
#the settings are read, the feed is fetched, and every entry not yet sent
#is turned into a message which is then displayed, sent and/or recorded as
#sent depending on the command-line options.
for thisconfig in args:

    print("Reading config: " + thisconfig)
    config = ConfigObj(thisconfig, file_error=True)

    print('Reading feed...')
    feed = feedparser.parse(config['feedurl'])

    #Number of entries processed so far for this feed
    done = 0

    #Determine message mode...
    msgmode = config.get('msgmode', 'title')

    #Determine maximum message length...
    maxlen = int(config.get('maxlen', 140))

    #Notice source
    source = config.get('source', 'feed2omb')

    #Determine maximum items to post (for this feed - command-line --max can
    #override...
    maxpost = int(config.get('maxpost', 2))

    #Determine if we are including links with the message...
    includelinks = config.get('includelinks') != 'no'

    #Determine sent mode... (i.e. how we decide if we've already sent an entry)
    sentmode = config.get('sentmode', 'sentlinks')
    if sentmode == 'timestamp':
        if 'lastsent' in config:
            lastsent = datetime(*time.strptime(config['lastsent'],
                                               "%Y-%m-%d %H:%M:%S")[0:6])
        else:
            lastsent = datetime.min
    elif 'sentlinks' not in config:
        #A config file that has never been used has no [sentlinks] section
        #yet - create it so the lookups and updates below work.
        config['sentlinks'] = {}

    #Determine url shortening mode...
    if 'urlshortener' in config:
        urlshortener = config['urlshortener']
        urlshortenhost = config.get('urlshortenhost')
    else:
        urlshortener = 'lilurl'
        urlshortenhost = 'http://ur1.ca'
    shortenalways = config.get('shortenalways') == 'yes'

    #Make sure the shortener is one we know about, rather than failing with
    #a KeyError at the first new entry. Only relevant if links are included.
    if includelinks and urlshortener not in shorteners:
        print('Unknown urlshortener "' + urlshortener + '"')
        sys.exit(1)

    #If we've been told to use a lilurl-based shortening host, make sure
    #we've been told which one...
    if urlshortener == 'lilurl' and urlshortenhost is None:
        print("Host must be specified for lilurl-based shortener")
        sys.exit(1)

    #If we've been told to use bit.ly, make sure we have an API key...
    if urlshortener == 'bit.ly' and ('urlshortenkey' not in config or
                                     'urlshortenlogin' not in config):
        print("Login and API key must be specified for bit.ly")
        print("Option one - register, get details, put in config file")
        print("Option two - use a different shortener")
        sys.exit(1)

    #Determine hashtag mode...
    hashtags = config.get('hashtags', 'none')

    #See if we are going to apply one or more regular expressions to
    #the messages. When we're done, we'll have two lists, msgregex being
    #all the precompiled regular expressions, and msgreplace being their
    #corresponding replacement strings.
    msgregex = []
    msgreplace = []
    if 'messageregex' in config and 'messagereplace' in config:
        creg = config.as_list('messageregex')
        crep = config.as_list('messagereplace')
        if len(creg) != len(crep):
            print("You must give the same number of regular expressions " +
                  "and replacements")
            sys.exit(1)
        msgregex = [re.compile(pattern) for pattern in creg]
        msgreplace = list(crep)

    #Finally we get to actually process the feed entries, in reverse of
    #the order in which the feed lists them...
    for entry in reversed(feed.entries):

        #Decide if this is a new entry or one we've already sent...
        if sentmode == 'timestamp':
            thissent = datetime(*entry.updated_parsed[0:6])
            isnew = lastsent < thissent
        else:
            isnew = "'" + entry.link + "'" not in config['sentlinks']
        if not isnew:
            continue

        print('Found new entry: ' + entry.link)

        #Shorten the URL...
        if includelinks:
            longurl = entry.link
            shorturl, urllen = shorteners[urlshortener](longurl,
                                                        urlshortenhost)
        else:
            urllen = 0

        #See how much space we have left once the URL is there:
        charsleft = maxlen
        if urllen > 0 and includelinks:
            #We will be adding " - " as well as the URL
            charsleft -= 3 + urllen

        if msgmode == 'authtitle':
            text = getauthor(entry) + ' - ' + entry.title
        elif msgmode == 'summary' or msgmode == 'authsummary':
            if 'summary' in entry:
                text = entry.summary
            else:
                text = entry.title
            if msgmode == 'authsummary':
                text = getauthor(entry) + ' - ' + text
        else:
            text = entry.title

        #Apply regular expression search/replaces to the message body if
        #requested...
        for regex, replacement in zip(msgregex, msgreplace):
            text = regex.sub(replacement, text)

        #Truncate the message text if necessary. The max() guards against
        #a negative slice index when fewer than three characters are left.
        if len(text) > charsleft:
            text = text[:max(charsleft - 3, 0)] + '...'

        #Append the url. Don't bother using the shortened one if the full
        #one fits...
        if includelinks:
            text += ' - '
            if not shortenalways and len(text + longurl) < maxlen:
                text += longurl
            else:
                text += shorturl

        #Add hashtags from categories if that mode is enabled. A tag is
        #only added if the message still fits with it.
        if hashtags == 'category' and 'categories' in entry:
            for dontcare, cattxt in entry.categories:
                cattxt = ' #' + cattxt
                if len(text + cattxt) < maxlen:
                    text += cattxt

        #Some console output to describe what's going on...
        if options.test:
            if options.eat:
                print('Eaten message would be:')
            else:
                print('Sent message would be:')
        else:
            if options.eat:
                print('Eating new message:')
            else:
                print('Sending new message:')
        if sys.stdout.encoding is not None:
            print(' ' + text.encode(sys.stdout.encoding, 'replace'))
        else:
            print(' <message hidden - output encoding cannot be ' +
                  'determined>')

        #Actually send the message, if that's what we're supposed to be
        #doing...
        if not options.test and options.update:

            #OMB API send...
            if 'apibaseurl' in config and config['apibaseurl'] != "":
                passwordmgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
                passwordmgr.add_password(None, config['apibaseurl'],
                                         config['user'], config['password'])
                handler = urllib2.HTTPBasicAuthHandler(passwordmgr)
                opener = urllib2.build_opener(handler)
                data = {'status': text.encode('utf-8'),
                        'source': source}
                resp = opener.open(config['apibaseurl'] +
                                   '/statuses/update.xml', urlencode(data))
                resp.close()

            #XMPP send...
            if 'xmpp_server' in config and config['xmpp_server'] != "":
                import xmpp
                #Note that we connect and disconnect for each message
                #currently!

                jid = xmpp.protocol.JID(config['xmpp_jid'])
                client = xmpp.Client(jid.getDomain(), debug=[])
                client.connect()
                client.auth(jid.getNode(), config['xmpp_password'],
                            resource="feed2omb")
                client.send(xmpp.protocol.Message(config['xmpp_to'],
                                                  text))

        #Record that we have sent this entry...
        #NOTE(review): the original nesting of this step could not be
        #recovered with certainty; here it always updates the in-memory
        #config, and only the write below is skipped in test mode - confirm.
        if sentmode == 'timestamp':
            lastsent = thissent
            config['lastsent'] = lastsent.strftime("%Y-%m-%d %H:%M:%S")
        else:
            config['sentlinks']["'" + entry.link + "'"] = 'sent'

        #Rewrite the config after each link to avoid double-posting if
        #something goes wrong.
        if not options.test:
            config.write()

        #Keep track of how many items we've posted and stop if we reach the
        #requested limit. A limit of 0 means no limit.
        done += 1
        thismax = options.max
        if thismax == -1:
            thismax = maxpost
        if thismax > 0 and done >= thismax:
            print("Reached requested limit")
            break
455
print('Finished')

#Restore the original stdout. If output had been redirected to the log
#file, close that file now that nothing more will be written to it.
logfile = sys.stdout
sys.stdout = savout
if logfile is not savout:
    logfile.close()
459

Archive Download this file

Branches

Tags