3 # Copyright 2007 Doug Hellmann.
8 # Permission to use, copy, modify, and distribute this software and
9 # its documentation for any purpose and without fee is hereby
10 # granted, provided that the above copyright notice appear in all
11 # copies and that both that copyright notice and this permission
12 # notice appear in supporting documentation, and that the name of Doug
13 # Hellmann not be used in advertising or publicity pertaining to
14 # distribution of the software without specific, written prior
17 # DOUG HELLMANN DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
18 # INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
19 # NO EVENT SHALL DOUG HELLMANN BE LIABLE FOR ANY SPECIAL, INDIRECT OR
20 # CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
21 # OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
22 # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
23 # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
# Module identification string, normally expanded by the VCS "$Id$" keyword.
__module_id__ = "$Id$"
# Import system modules
# Import local modules
# Module-level logger shared by the Cache class below; the embedding
# application is expected to configure handlers/levels for 'feedcache.cache'.
logger = logging.getLogger('feedcache.cache')
52 """A class to wrap Mark Pilgrim's Universal Feed Parser module
53 (http://www.feedparser.org) so that parameters can be used to
54 cache the feed results locally instead of fetching the feed every
    time it is requested.  Uses both etag and modified times for
    change detection, so unchanged feeds are not re-downloaded.
    def __init__(self, storage, timeToLiveSeconds=300, userAgent='feedcache'):
        """Initialize the cache.

        storage -- Backing store for the cache.  It should follow
            the dictionary API, with URLs used as keys.  It should
            hold the (timestamp, feed-data) tuples this class stores
            in it -- NOTE(review): the rest of this parameter's
            original description is elided from this listing; confirm
            any further requirements against the storage backends used.

        timeToLiveSeconds=300 -- The length of time content should
            live in the cache before an update is attempted.

        userAgent='feedcache' -- User agent string to be used when
            fetching feed contents.
        """
        self.storage = storage
        # Seconds a cached entry stays valid before a refresh is attempted.
        self.time_to_live = timeToLiveSeconds
        self.user_agent = userAgent
79 def purge(self, olderThanSeconds):
80 """Remove cached data from the storage if the data is older than the
81 date given. If olderThanSeconds is None, the entire cache is purged.
83 if olderThanSeconds is None:
84 logger.debug('purging the entire cache')
85 for key in self.storage.keys():
89 # Iterate over the keys and load each item one at a time
90 # to avoid having the entire cache loaded into memory
92 for url in self.storage.keys():
93 (cached_time, cached_data) = self.storage[url]
94 age = now - cached_time
95 if age >= olderThanSeconds:
96 logger.debug('removing %s with age %d', url, age)
    def fetch(self, url, force_update = False, offline = False):
        """Return the feed at url.

        url - The URL of the feed.

        force_update=False - When True, update the cache whether the
                             current contents have
                             exceeded their time-to-live
                             or not.

        offline=False - When True, only return data from the local
                        cache and never access the remote
                        server.

        If there is data for that feed in the cache already, check
        the expiration date before accessing the server.  If the
        cached data has not expired, return it without accessing the
        server.

        In cases where the server is accessed, check for updates
        before deciding what to return.  If the server reports a
        status of 304, the previously cached content is returned.

        The cache is only updated if the server returns a status of
        200, to avoid holding redirected data in the cache.
        """
        logger.debug('url="%s"' % url)

        # Convert the URL to a value we can use
        # as a key for the storage backend.
        # NOTE(review): the statement that defines 'key' is elided from
        # this listing -- presumably key = url; verify in the full file.
        if isinstance( key, unicode):
            # NOTE(review): 'unicode' is a Python 2 builtin; a Python 3
            # port would need 'str' handling here.
            key = key.encode('utf-8')

        # Previously cached copy, if any; (None, None) means this URL
        # has never been stored.
        cached_time, cached_content = self.storage.get(key, (None, None))

        # Offline mode support (no networked requests)
        # so return whatever we found in the storage.
        # If there is nothing in the storage, we'll be returning None.
            # NOTE(review): the 'if offline:' guard for these two lines
            # is elided from this listing.
            logger.debug('offline mode')
            return cached_content

        # Does the storage contain a version of the data
        # which is older than the time-to-live?
        logger.debug('cache modified time: %s' % str(cached_time))
        if cached_time is not None and not force_update:
            if self.time_to_live:
                # NOTE(review): 'now' is assigned in an elided line,
                # presumably now = time.time() earlier in this method.
                age = now - cached_time
                if age <= self.time_to_live:
                    logger.debug('cache contents still valid')
                    return cached_content
                    # NOTE(review): an 'else:' introducing the next line
                    # is elided from this listing.
                    logger.debug('cache contents older than TTL')
                # NOTE(review): an 'else:' introducing the next line
                # (the no-TTL case) is elided from this listing.
                logger.debug('no TTL value')

            # The cache is out of date, but we have
            # something.  Try to use the etag and modified_time
            # values from the cached content.
            etag = cached_content.get('etag')
            modified = cached_content.get('modified')
            logger.debug('cached etag=%s' % etag)
            logger.debug('cached modified=%s' % str(modified))
            # NOTE(review): an 'else:' introducing the next line is
            # elided from this listing.
            logger.debug('nothing in the cache, or forcing update')

        # We know we need to fetch, so go ahead and do it.
        logger.debug('fetching...')
        # NOTE(review): the remaining keyword arguments to parse()
        # (presumably etag=..., modified=...) and the closing paren
        # are elided from this listing.
        parsed_result = feedparser.parse(url,
                                         agent=self.user_agent,

        # feedparser reports the HTTP status when the feed came from a
        # server; absent for local/parse-only inputs.
        status = parsed_result.get('status', None)
        logger.debug('status=%s' % status)
            # NOTE(review): the 'if status == 304:' guard for this
            # branch is elided from this listing.
            # No new data, based on the etag or modified values.
            # We need to update the modified time in the
            # storage, though, so we know that what we have
            # stored is up to date.
            self.storage[key] = (now, cached_content)

            # Return the data from the cache, since
            # the parsed data will be empty.
            parsed_result = cached_content
            # There is new content, so store it unless there was an error.
            # NOTE(review): this is the elided 'else:' branch of the
            # status check above.
            error = parsed_result.get('bozo_exception')
                # NOTE(review): an 'if not error:' guard for the next two
                # lines is elided from this listing.
                logger.debug('Updating stored data for %s' % url)
                self.storage[key] = (now, parsed_result)
                # NOTE(review): elided 'else:' -- the parse failed, so the
                # bad result is returned to the caller but never cached.
                logger.warning('Not storing data with exception: %s' % str(error))
                # NOTE(review): the method continues past this listing,
                # presumably returning parsed_result.