module FeedFetcher #:nodoc:
  # The three error classes below are not raised anywhere in this file; they
  # are kept so that code elsewhere that references them keeps working.
  class FeedSourceError < RuntimeError
  end

  class NoFeedForPageError < RuntimeError
  end

  class PageFeedError < RuntimeError
  end

  # Raised by FeedFetcher.fetch_page when the resource is missing
  # (Errno::ENOENT for local paths, OpenURI::HTTPError for HTTP failures).
  class PageDoesntExistError < RuntimeError
  end

  # Raised by FeedFetcher.fetch_page when opening the URL fails with a
  # SocketError (for example an unresolvable host).
  class InvalidURIError < RuntimeError
  end

  # Class-level helpers that discover, download and parse RSS/ATOM feeds.
  #
  # State note: the most recently fetched resource is kept in the class
  # instance variable @xml and the last detected feed type in @feed_type, so
  # results of concurrent calls can overwrite each other.
  class FeedFetcher
    # XPath queries used to find feed <link> tags, in order of preference.
    FEED_LINK_XPATHS = [
      "//html/head/link[@type='application/rss+xml']",
      "//html/head/link[@type='application/atom+xml']"
    ].freeze

    # Matches strings that start with an explicit "scheme://" prefix.
    URL_SCHEME = %r{\A[a-zA-Z][a-zA-Z0-9+.\-]*://}

    # Check whether page_url is a feed, or an HTML page that links to a feed.
    # The kind of value returned depends on option:
    #
    # * :only_verify : true when a feed was found
    # * :feed_url    : the feed url (page_url itself, or the url linked from the page)
    # * :xml         : {:file => file, :feed_url => feed_url}
    #
    # Returns false when a document could not be loaded or the linked document
    # is not a feed, and nil when the page is neither a feed nor an HTML page
    # with a usable feed link (or when option is not one of the above).
    # Errors raised by fetch_page propagate to the caller.
    def self.is_feed?(page_url, option)
      doc = get_doc(page_url)
      return false if doc.nil?
      return feed_result(option, page_url) if is_xml_feed?(doc)
      return nil unless is_html_page?(doc) && html_has_feed_link?(doc)

      # Feed links are frequently relative ("/feed.xml"); resolve them against
      # the page before fetching, otherwise the request goes to a bogus host.
      feed_url = absolute_feed_url(page_url, feed_link_from_html(doc))
      return nil if feed_url.nil?

      feed_doc = get_doc(feed_url)
      return false if feed_doc.nil? || !is_xml_feed?(feed_doc)

      feed_result(option, feed_url)
    end

    # Parse a feed and return the value produced by
    # FeedNormalizer::FeedNormalizer.parse.
    #
    # option:
    # * :feed_url : url_or_xml is the feed url; it is downloaded with fetch_page
    # * :xml      : url_or_xml is the feed data itself, or the hash returned by
    #               is_feed?(url, :xml)
    #
    # Returns nil when there is nothing to parse or option is unknown.
    def self.parse(url_or_xml, option = :feed_url)
      feed_xml =
        case option
        when :feed_url then fetch_page(url_or_xml)
        when :xml then xml_source(url_or_xml)
        end
      return nil unless feed_xml

      require 'feed-normalizer'
      FeedNormalizer::FeedNormalizer.parse feed_xml
    end

    # Open the resource at site_url_to_open and return the opened IO object.
    #
    # Raises PageDoesntExistError for a missing local file or an HTTP error
    # and InvalidURIError for a SocketError.
    #
    # NOTE(review): strings without a "scheme://" prefix still end up in
    # Kernel#open, which treats a leading "|" as a command to run. Do not pass
    # untrusted input here without going through url_with_protocol first.
    def self.fetch_page(site_url_to_open)
      require 'open-uri'
      # Kernel#open stopped handling URLs in Ruby 3.0; URI.open is the
      # supported entry point wherever it exists.
      if URI.respond_to?(:open)
        URI.open(site_url_to_open)
      else
        open(site_url_to_open)
      end
    rescue Errno::ENOENT, OpenURI::HTTPError
      raise PageDoesntExistError
    rescue SocketError
      raise InvalidURIError
    end

    # NOTE: `protected`/`private` keywords do not apply to `def self.` methods,
    # so the helpers below are publicly callable. They are deliberately left
    # that way to keep existing callers working.

    # Return a truthy feed type ('RSS' or 'ATOM', also stored in @feed_type)
    # when doc is an XML feed, false otherwise.
    def self.is_xml_feed?(doc)
      if !doc.search("//rss").empty? || !doc.search("//rdf:rdf").empty?
        @feed_type = 'RSS'
      elsif !doc.search("//feed").empty?
        @feed_type = 'ATOM'
      else
        false
      end
    end

    # Return true when doc contains an <html> element.
    def self.is_html_page?(doc)
      !doc.search("//html").empty?
    end

    # Return true when the HTML head of doc links to an RSS or ATOM feed.
    def self.html_has_feed_link?(doc)
      !feed_links(doc).empty?
    end

    # Return the href of the first feed link in doc (RSS preferred over ATOM),
    # or nil when the document has no feed link.
    def self.feed_link_from_html(doc)
      link = feed_links(doc).first
      link && link[:href]
    end

    # Fetch site_url_to_open (adding "http://" when it has no scheme), remember
    # the opened resource in @xml and return it parsed by Hpricot.
    def self.get_doc(site_url_to_open)
      @xml = fetch_page(url_with_protocol(site_url_to_open))
      return nil unless @xml

      require 'hpricot'
      Hpricot(@xml)
    end

    # Return original_url with "http://" prepended unless it already starts
    # with a "scheme://" prefix. Returns nil when original_url is nil/false.
    #
    # The check is anchored at the start of the string so that a colon later
    # in the url (a port such as "example.com:8080") is not mistaken for a
    # scheme.
    def self.url_with_protocol(original_url)
      return nil unless original_url
      return original_url if original_url =~ URL_SCHEME

      'http://' + original_url
    end

    # Build the value is_feed? returns for option once a feed was found at
    # feed_url; nil for an unknown option.
    def self.feed_result(option, feed_url)
      case option
      when :only_verify then true
      when :feed_url then feed_url
      when :xml then { :file => @xml, :feed_url => feed_url }
      end
    end
    private_class_method :feed_result

    # Return the matches of the first feed-link query that finds something,
    # or an empty array. A plain `a || b` cannot be used here because an empty
    # result set is still truthy in Ruby.
    def self.feed_links(doc)
      FEED_LINK_XPATHS.each do |xpath|
        links = doc.search(xpath)
        return links unless links.nil? || links.empty?
      end
      []
    end
    private_class_method :feed_links

    # Turn the href of a feed link into an absolute url, resolving relative
    # references against page_url. Returns nil for a missing/empty href and
    # the untouched href when it cannot be resolved.
    def self.absolute_feed_url(page_url, href)
      href = href.to_s.strip
      return nil if href.empty?
      return href if href =~ URL_SCHEME

      require 'uri'
      URI.join(url_with_protocol(page_url), href).to_s
    rescue URI::Error
      href
    end
    private_class_method :absolute_feed_url

    # Extract the feed data from what parse received with the :xml option:
    # either the data itself or the {:file => ..., :feed_url => ...} hash
    # built by is_feed?.
    def self.xml_source(xml)
      return xml unless xml.is_a?(Hash)

      file = xml[:file]
      # Presumably the stream was already read while detecting the feed in
      # get_doc, so start again from the beginning when the object allows it.
      file.rewind if file.respond_to?(:rewind)
      file
    end
    private_class_method :xml_source
  end #CLASS
end