From: David Adam (zanchey)
Date: Mon, 13 Jul 2009 13:21:19 +0000 (+0800)
Subject: add feedcache module
X-Git-Url: https://git.ucc.asn.au/?a=commitdiff_plain;h=8ede81c2494718e19dac6aa98f00fb53307ce6c6;p=planet-ucc.git

add feedcache module
---

diff --git a/Changelog b/Changelog
index 494f999..2425196 100644
--- a/Changelog
+++ b/Changelog
@@ -1,3 +1,8 @@
+2009-07-13 David Adam
+
+	* new cache method: imported feedcache 1.3.1
+	  (http://www.doughellmann.com/projects/feedcache/)
+
 2009-04-19 Davyd Madeley
 
 	* (committed by zanchey@ucc as part of the update to git)

diff --git a/extra/feedcache.py b/extra/feedcache.py
new file mode 100644
index 0000000..00bc81b
--- /dev/null
+++ b/extra/feedcache.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python
+#
+# Copyright 2007 Doug Hellmann.
+#
+#
+# All Rights Reserved
+#
+# Permission to use, copy, modify, and distribute this software and
+# its documentation for any purpose and without fee is hereby
+# granted, provided that the above copyright notice appear in all
+# copies and that both that copyright notice and this permission
+# notice appear in supporting documentation, and that the name of Doug
+# Hellmann not be used in advertising or publicity pertaining to
+# distribution of the software without specific, written prior
+# permission.
+#
+# DOUG HELLMANN DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+# INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
+# NO EVENT SHALL DOUG HELLMANN BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+#
+
+"""
+Cache wrapper for feedparser, keeping parsed feeds in a dict-like store.
+"""
+
+__module_id__ = "$Id$"
+
+#
+# Import system modules
+#
+import feedparser
+
+import logging
+import time
+
+#
+# Import local modules
+#
+
+
+#
+# Module
+#
+
+logger = logging.getLogger('feedcache.cache')
+
+class Cache:
+    """A class to wrap Mark Pilgrim's Universal Feed Parser module
+    (http://www.feedparser.org) so that parameters can be used to
+    cache the feed results locally instead of fetching the feed every
+    time it is requested.  Uses both etag and modified times for
+    caching.
+    """
+
+    def __init__(self, storage, timeToLiveSeconds=300, userAgent='feedcache'):
+        """
+        Arguments:
+
+          storage -- Backing store for the cache.  It should follow
+          the dictionary API, with URLs used as keys.  It should
+          persist data.
+
+          timeToLiveSeconds=300 -- The length of time content should
+          live in the cache before an update is attempted.
+
+          userAgent='feedcache' -- User agent string to be used when
+          fetching feed contents.
+
+        """
+        self.storage = storage
+        self.time_to_live = timeToLiveSeconds
+        self.user_agent = userAgent
+        return
+
+    def purge(self, olderThanSeconds):
+        """Remove cached data from the storage if the data is older than
+        the given number of seconds.  If olderThanSeconds is None, the
+        entire cache is purged.
+        """
+        if olderThanSeconds is None:
+            logger.debug('purging the entire cache')
+            for key in self.storage.keys():
+                del self.storage[key]
+        else:
+            now = time.time()
+            # Iterate over the keys and load each item one at a time
+            # to avoid having the entire cache loaded into memory
+            # at one time.
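+            # keys() returns a list in Python 2 (for both dict and
+            # shelve), so it is safe to delete entries while iterating.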
+            for url in self.storage.keys():
+                (cached_time, cached_data) = self.storage[url]
+                age = now - cached_time
+                if age >= olderThanSeconds:
+                    logger.debug('removing %s with age %d', url, age)
+                    del self.storage[url]
+        return
+
+    def fetch(self, url, force_update=False, offline=False):
+        """Return the feed at url.
+
+        url - The URL of the feed.
+
+        force_update=False - When True, update the cache whether the
+                             current contents have exceeded their
+                             time-to-live or not.
+
+        offline=False - When True, only return data from the local
+                        cache and never access the remote URL.
+
+        If there is data for that feed in the cache already, check
+        the expiration date before accessing the server.  If the
+        cached data has not expired, return it without accessing the
+        server.
+
+        In cases where the server is accessed, check for updates
+        before deciding what to return.  If the server reports a
+        status of 304, the previously cached content is returned.
+
+        The cache is only updated if the server returns a status of
+        200, to avoid holding redirected data in the cache.
+        """
+        logger.debug('url="%s"' % url)
+
+        # Convert the URL to a value we can use
+        # as a key for the storage backend.
+        key = url
+        if isinstance(key, unicode):
+            key = key.encode('utf-8')
+
+        modified = None
+        etag = None
+        now = time.time()
+
+        cached_time, cached_content = self.storage.get(key, (None, None))
+
+        # Offline mode support (no networked requests)
+        # so return whatever we found in the storage.
+        # If there is nothing in the storage, we'll be returning None.
+        if offline:
+            logger.debug('offline mode')
+            return cached_content
+
+        # Does the storage contain a version of the data
+        # which is older than the time-to-live?
+        logger.debug('cache modified time: %s' % str(cached_time))
+        if cached_time is not None and not force_update:
+            if self.time_to_live:
+                age = now - cached_time
+                if age <= self.time_to_live:
+                    logger.debug('cache contents still valid')
+                    return cached_content
+                else:
+                    logger.debug('cache contents older than TTL')
+            else:
+                logger.debug('no TTL value')
+
+            # The cache is out of date, but we have
+            # something.  Try to use the etag and modified_time
+            # values from the cached content.
+            etag = cached_content.get('etag')
+            modified = cached_content.get('modified')
+            logger.debug('cached etag=%s' % etag)
+            logger.debug('cached modified=%s' % str(modified))
+        else:
+            logger.debug('nothing in the cache, or forcing update')
+
+        # We know we need to fetch, so go ahead and do it.
+        logger.debug('fetching...')
+        parsed_result = feedparser.parse(url,
+                                         agent=self.user_agent,
+                                         modified=modified,
+                                         etag=etag,
+                                         )
+
+        status = parsed_result.get('status', None)
+        logger.debug('status=%s' % status)
+        if status == 304:
+            # No new data, based on the etag or modified values.
+            # We need to update the modified time in the
+            # storage, though, so we know that what we have
+            # stored is up to date.
+            self.storage[key] = (now, cached_content)
+
+            # Return the data from the cache, since
+            # the parsed data will be empty.
+            parsed_result = cached_content
+        elif status == 200:
+            # There is new content, so store it unless there was an error.
+            error = parsed_result.get('bozo_exception')
+            if not error:
+                logger.debug('Updating stored data for %s' % url)
+                self.storage[key] = (now, parsed_result)
+            else:
+                logger.warning('Not storing data with exception: %s' %
+                               str(error))
+
+        return parsed_result
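
For reference, Cache only assumes a persistent dict-like storage object, so
the standard library's shelve module is enough to drive it. A minimal usage
sketch, not part of this commit (Python 2 to match the module; the shelf
path, feed URL, and user agent string are illustrative, and it assumes
extra/feedcache.py is importable as feedcache):

import shelve

from feedcache import Cache

# Open a persistent dict-like store.  feedcache encodes unicode URLs to
# utf-8 byte strings, which is what shelve expects as keys.
storage = shelve.open('/tmp/planet-feeds.cache')
try:
    cache = Cache(storage, timeToLiveSeconds=300, userAgent='planet-ucc')
    # The first call hits the network; repeat calls within the TTL are
    # answered from the shelf, and a 304 response reuses the stored data.
    feed = cache.fetch('http://example.com/atom.xml')
    for entry in feed.entries:
        print entry.title
    # Drop anything cached more than a day ago.
    cache.purge(24 * 60 * 60)
finally:
    storage.close()

Because fetch() only stores results on a clean 200 response, a shelf
populated this way can also be read back later with offline=True when the
network is unavailable.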