diff --git a/.gitignore b/.gitignore index 1377554..23e0bfb 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ *.swp +Gemfile.lock +*.gem diff --git a/COPYING b/COPYING index be5ea3f..acbbd20 100644 --- a/COPYING +++ b/COPYING @@ -1,4 +1,4 @@ -Copyright (C) 2012 David Moreno +Copyright (C) 2008-2014 David Moreno Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the diff --git a/ChangeLog b/ChangeLog index ca257d3..b4cdb5e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,17 @@ +* 0.9.4 - Sun Aug 3 13:51:09 CEST 2014 + - Enhancements to the spec. + - Updated years on license notes + +* 0.9.3 - Sun Aug 3 13:48:01 CEST 2014 + - Rebuild of the gem to get rid of Hpricot. + - Appropriate build on Nokogiri. + +* 0.9.2 - Sat Dec 7 18:32:17 CET 2013 + - Up to date bits and pieces. + +* 0.9.1 - Fri Mar 16 12:00:00 EDT 2012 + - Small error on the release date. + * 0.9 - Fri Mar 16 10:59:00 EDT 2012 - Changed license to MIT. diff --git a/Gemfile.lock b/Gemfile.lock deleted file mode 100644 index 5673205..0000000 --- a/Gemfile.lock +++ /dev/null @@ -1,32 +0,0 @@ -PATH - remote: . - specs: - feedbag (0.9.1) - nokogiri - -GEM - remote: https://rubygems.org/ - specs: - activesupport (3.2.13) - i18n (= 0.6.1) - multi_json (~> 1.0) - i18n (0.6.1) - metaclass (0.0.1) - mocha (0.12.10) - metaclass (~> 0.0.1) - multi_json (1.7.2) - nokogiri (1.5.9) - shoulda (3.4.0) - shoulda-context (~> 1.0, >= 1.0.1) - shoulda-matchers (~> 1.0, >= 1.4.1) - shoulda-context (1.1.0) - shoulda-matchers (1.4.1) - activesupport (>= 3.0.0) - -PLATFORMS - ruby - -DEPENDENCIES - feedbag! - mocha (~> 0.12.0) - shoulda diff --git a/README.markdown b/README.markdown index 88f814b..c4c9de2 100644 --- a/README.markdown +++ b/README.markdown @@ -1,97 +1,51 @@ Feedbag ======= -Feedbag is a Ruby library for the auto-discovery of syndicated feeds (RSS/Atom). +Yet another fork of Feedbag, Ruby's favorite auto-discovery tool/library! 
### Quick synopsis - >> require "rubygems" - => true - >> require "feedbag" - => true - >> Feedbag.find "http://damog.nl/blog" - => ["http://damog.net/blog/index.rss", "http://damog.net/blog/tags/feed", "http://damog.net/blog/tags/rfeed"] - >> Feedbag.feed?("google.com") - => false - >> Feedbag.feed?("http://planet.debian.org/rss20.xml") - => true + >> require "feedbag" + => true + >> Feedbag.find "damog.net/blog" + => ["http://damog.net/blog/index.rss", "http://damog.net/blog/tags/feed", "http://damog.net/blog/tags/rfeed"] + >> Feedbag.feed? "perl.org" + => false + >> Feedbag.feed?("http://jobs.perl.org/rss/standard.rss") + => true ### Installation - $ gem install feedbag + $ gem install feedbag Or just grab feedbag.rb and use it on your own project: - $ wget http://github.com/damog/feedbag/raw/master/lib/feedbag.rb + $ wget http://github.com/damog/feedbag/raw/master/lib/feedbag.rb -## Tutorial +You can also use the command line tool for quick queries, if you install the gem: -So you want to know more about it. - -OK, if the URL passed to the find method is a feed itself, that only feed URL will be returned. - - >> Feedbag.find "github.com/damog.atom" - => ["http://github.com/damog.atom"] - >> - -Otherwise, it will always return LINK feeds first, A (anchor tags) feeds later. 
Between A feeds, the ones hosted on the same URL's host, will have larger priority: - - >> Feedbag.find "http://ve.planetalinux.org" - => ["http://feedproxy.google.com/PlanetaLinuxVenezuela", "http://rendergraf.wordpress.com/feed/", "http://rootweiller.wordpress.com/feed/", "http://skatox.com/blog/feed/", "http://kodegeek.com/atom.xml", "http://blog.0x29.com.ve/?feed=rss2&cat=8"] - >> - -On your application you should only take the very first element of the array, most of the times: - - >> Feedbag.find("planet.debian.org").first(3) - => ["http://planet.debian.org/rss10.xml", "http://planet.debian.org/rss20.xml", "http://planet.debian.org/atom.xml"] - >> - -(Try running that same example without the "first" method. That example's host is a blog aggregator, so it has hundreds of feed URLs:) - - >> Feedbag.find("planet.debian.org").size - => 104 - >> - -Feedbag will find them all, but it will return the most important ones on the first elements on the array returned. - - >> Feedbag.find("cnn.com") - => ["http://rss.cnn.com/rss/cnn_topstories.rss", "http://rss.cnn.com/rss/cnn_latest.rss", "http://rss.cnn.com/services/podcasting/robinmeade/rss.xml"] - >> + $ feedbag http://rubygems.org/profiles/damog + == http://rubygems.org/profiles/damog: + - http://feeds.feedburner.com/gemcutter-latest ### Why should you use it? -- Because it's cool. - Because it only uses [Nokogiri](http://nokogiri.org/) as dependency. - Because it follows modern feed filename conventions (like those ones used by WordPress blogs, or Blogger, etc). - Because it's a single file you can embed easily in your application. +- Because it's faster than rfeedfinder. -### Why did I build it? - -- Because I liked Benjamin Trott's [Feed::Find](http://search.cpan.org/~btrott/Feed-Find-0.06/lib/Feed/Find.pm). -- Because I thought it would be good to have Feed::Find's functionality in Ruby. -- Because I thought it was going to be easy to maintain. 
-- Because I was going to use it on [rFeed](http://github.com/damog/rfeed). -- And finally, because I didn't know [rfeedfinder](http://rfeedfinder.rubyforge.org/) existed :-) - -### Bugs - -Please, report bugs to [rt@support.axiombox.com](rt@support.axiombox.com) or directly to the author. +### Why the fork? -### Contribute +This fork introduces one new dependency to support following HTTP redirects. Other enhancements include: -> git clone git://github.com/damog/feedbag.git +- Support for a custom user agent: set your preferred user agent in the `FEEDBAG_UA` environment variable. +- Better encoding conversion, specifically for Japanese sites that use non-UTF-8 encodings. -...patch, build, hack and make pull requests. I'll be glad. - -### Author +### Original Author [David Moreno](http://damog.net/) <[david@axiombox.com](mailto:david@axiombox.com)>. ### Copyright -This is free software. See [COPYING](http://github.com/damog/feedbag/master/COPYING) for more information. - -### Thanks - -[Raquel](http://maggit.net), for making [Axiombox](http://axiombox.com) and most of my dreams possible. Also, [GitHub](http://github.com) for making a nice code sharing service that doesn't suck. - +This is free software. See [COPYING](https://raw.githubusercontent.com/damog/feedbag/master/COPYING) for more information. diff --git a/TODO b/TODO deleted file mode 100644 index f3fe1e2..0000000 --- a/TODO +++ /dev/null @@ -1 +0,0 @@ -- Document Feedbag.feed? 
diff --git a/feedbag.gemspec b/feedbag.gemspec index cef181a..70c8e23 100644 --- a/feedbag.gemspec +++ b/feedbag.gemspec @@ -1,13 +1,15 @@ # -*- encoding: utf-8 -*- - + Gem::Specification.new do |s| s.name = %q{feedbag} - s.version = "0.9.1" - s.homepage = "http://axiombox.com/feedbag" + s.version = "0.9.5" + s.homepage = "http://github.com/damog/feedbag" s.rubyforge_project = "feedbag" - + s.authors = ["Axiombox", "David Moreno", "Derek Willis"] s.date = %q{2012-03-16} + s.licenses = ["MIT"] + s.description = %q{Ruby's favorite feed auto-discoverty tool} s.email = %q{david@axiombox.com} s.extra_rdoc_files = ["README.markdown", "COPYING"] @@ -15,9 +17,13 @@ Gem::Specification.new do |s| s.has_rdoc = true s.rdoc_options = ["--main", "README.markdown"] s.summary = %q{Ruby's favorite feed auto-discovery tool} - s.add_dependency("nokogiri") - s.add_development_dependency "shoulda" - s.add_development_dependency "mocha", "~> 0.12.0" + + s.add_dependency("nokogiri") + s.add_dependency("open_uri_redirections") + + s.add_development_dependency "shoulda" + s.add_development_dependency "mocha", ">= 0.12.0" + s.bindir = 'bin' s.default_executable = %q{feedbag} s.executables = ["feedbag"] diff --git a/index.html b/index.html deleted file mode 100644 index 0b02147..0000000 --- a/index.html +++ /dev/null @@ -1,115 +0,0 @@ -

Feedbag

- -
-

Do you want me to drag my sack across your face? - - Glenn Quagmire

-
- -

Feedbag is a feed auto-discovery Ruby library. You don't need to know more about it. It is said to be:

- -
-

Ruby's favorite auto-discovery tool/library!

-
- -

Quick synopsis

- -
>> require "rubygems"
-=> true
->> require "feedbag"
-=> true
->> Feedbag.find "log.damog.net"
-=> ["http://feeds.feedburner.com/TeoremaDelCerdoInfinito", "http://log.damog.net/comments/feed/"]
-
- -

Installation

- -
$ sudo gem install damog-feedbag -s http://gems.github.com/
-
- -

Or just grab feedbag.rb and use it on your own project:

- -
$ wget http://github.com/damog/feedbag/raw/master/lib/feedbag.rb
-
- -

Tutorial

- -

So you want to know more about it.

- -

OK, if the URL passed to the find method is a feed itself, that only feed URL will be returned.

- -
>> Feedbag.find "github.com/damog.atom"
-=> ["http://github.com/damog.atom"]
->>
-
- -

Otherwise, it will always return LINK feeds first, A (anchor tags) feeds later. Between A feeds, the ones hosted on the same URL's host, will have larger priority:

- -
>> Feedbag.find "http://ve.planetalinux.org"
-=> ["http://feedproxy.google.com/PlanetaLinuxVenezuela", "http://rendergraf.wordpress.com/feed/", "http://rootweiller.wordpress.com/feed/", "http://skatox.com/blog/feed/", "http://kodegeek.com/atom.xml", "http://blog.0x29.com.ve/?feed=rss2&cat=8"]
->>
-
- -

On your application you should only take the very first element of the array, most of the times:

- -
>> Feedbag.find("planet.debian.org").first(3)
-=> ["http://planet.debian.org/rss10.xml", "http://planet.debian.org/rss20.xml", "http://planet.debian.org/atom.xml"]
->>
-
- -

(Try running that same example without the "first" method. That example's host is a blog aggregator, so it has hundreds of feed URLs:)

- -
>> Feedbag.find("planet.debian.org").size
-=> 104
->>
-
- -

Feedbag will find them all, but it will return the most important ones on the first elements on the array returned.

- -
>> Feedbag.find("cnn.com")
-=> ["http://rss.cnn.com/rss/cnn_topstories.rss", "http://rss.cnn.com/rss/cnn_latest.rss", "http://rss.cnn.com/services/podcasting/robinmeade/rss.xml"]
->>
-
- -

Why should you use it?

- - - -

Why did I build it?

- - - -

Bugs

- -

Please, report bugs to rt@support.axiombox.com or directly to the author.

- -

Contribute

- -
-

git clone git://github.com/damog/feedbag.git

-
- -

...patch, build, hack and make pull requests. I'll be glad.

- -

Author

- -

David Moreno <david@axiombox.com>.

- -

Copyright

- -

This is free software. See COPYING for more information.

- -

Thanks

- -

Raquel, for making Axiombox and most of my dreams possible. Also, GitHub for making a nice code sharing service that doesn't suck.

diff --git a/lib/feedbag.rb b/lib/feedbag.rb index e44def9..7e3deaf 100644 --- a/lib/feedbag.rb +++ b/lib/feedbag.rb @@ -1,6 +1,6 @@ #!/usr/bin/ruby -# Copyright (c) 2012 David Moreno +# Copyright (c) 2008-2014 David Moreno # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the @@ -24,18 +24,22 @@ require "rubygems" require "nokogiri" require "open-uri" +require "open_uri_redirections" require "net/http" +require "kconv" + +USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36" class Feedbag - CONTENT_TYPES = [ - 'application/x.atom+xml', - 'application/atom+xml', - 'application/xml', - 'text/xml', - 'application/rss+xml', - 'application/rdf+xml', - ].freeze + CONTENT_TYPES = [ + 'application/x.atom+xml', + 'application/atom+xml', + 'application/xml', + 'text/xml', + 'application/rss+xml', + 'application/rdf+xml', + ].freeze def self.feed?(url) new.feed?(url) @@ -49,160 +53,161 @@ def initialize @feeds = [] end - def feed?(url) - # use LWR::Simple.normalize some time - url_uri = URI.parse(url) - url = "#{url_uri.scheme or 'http'}://#{url_uri.host}#{url_uri.path}" - url << "?#{url_uri.query}" if url_uri.query - - # hack: - url.sub!(/^feed:\/\//, 'http://') - - res = Feedbag.find(url) - if res.size == 1 and res.first == url - return true - else - return false - end - end - - def find(url, args = {}) - url_uri = URI.parse(url) - url = nil - if url_uri.scheme.nil? 
- url = "http://#{url_uri.to_s}" - elsif url_uri.scheme == "feed" - return self.add_feed(url_uri.to_s.sub(/^feed:\/\//, 'http://'), nil) - else - url = url_uri.to_s - end - #url = "#{url_uri.scheme or 'http'}://#{url_uri.host}#{url_uri.path}" - - # check if feed_valid is avail + def feed?(url) + # use LWR::Simple.normalize some time + url_uri = URI.parse(url) + url = "#{url_uri.scheme or 'http'}://#{url_uri.host}#{url_uri.path}" + url << "?#{url_uri.query}" if url_uri.query + + # hack: + url.sub!(/^feed:\/\//, 'http://') + + res = Feedbag.find(url) + if res.size == 1 and res.first == url + return true + else + return false + end + end + + def find(url, args = {}) + url_uri = URI.parse(url) + url = nil + if url_uri.scheme.nil? + url = "http://#{url_uri.to_s}" + elsif url_uri.scheme == "feed" + return self.add_feed(url_uri.to_s.sub(/^feed:\/\//, 'http://'), nil) + else + url = url_uri.to_s + end + #url = "#{url_uri.scheme or 'http'}://#{url_uri.host}#{url_uri.path}" + + # check if feed_valid is avail begin - require "feed_validator" - v = W3C::FeedValidator.new - v.validate_url(url) - return self.add_feed(url, nil) if v.valid? - rescue LoadError - # scoo - rescue REXML::ParseException - # usually indicates timeout - # TODO: actually find out timeout. use Terminator? - # $stderr.puts "Feed looked like feed but might not have passed validation or timed out" + require "feed_validator" + v = W3C::FeedValidator.new + v.validate_url(url) + return self.add_feed(url, nil) if v.valid? + rescue LoadError + # scoo + rescue REXML::ParseException + # usually indicates timeout + # TODO: actually find out timeout. use Terminator? 
+ # $stderr.puts "Feed looked like feed but might not have passed validation or timed out" rescue => ex - $stderr.puts "#{ex.class} error ocurred with: `#{url}': #{ex.message}" - end - - begin - html = open(url) do |f| - content_type = f.content_type.downcase - if content_type == "application/octet-stream" # open failed - content_type = f.meta["content-type"].gsub(/;.*$/, '') - end - if CONTENT_TYPES.include?(content_type) - return self.add_feed(url, nil) - end - - doc = Nokogiri::HTML(f.read) - - if doc.at("base") and doc.at("base")["href"] - @base_uri = doc.at("base")["href"] - else - @base_uri = nil - end - - # first with links + $stderr.puts "#{ex.class} error ocurred with: `#{url}': #{ex.message}" + end + + begin + user_agent = ENV["FEEDBAG_UA"] || USER_AGENT + html = open(url, 'r:binary', "User-Agent" => user_agent, :allow_redirections => :safe) do |f| + content_type = f.content_type.downcase + if content_type == "application/octet-stream" # open failed + content_type = f.meta["content-type"].gsub(/;.*$/, '') + end + if CONTENT_TYPES.include?(content_type) + return self.add_feed(url, nil) + end + + doc = Nokogiri::HTML.parse(f.read.toutf8, nil, 'UTF-8') + + if doc.at("base") and doc.at("base")["href"] + @base_uri = doc.at("base")["href"] + else + @base_uri = nil + end + + # first with links (doc/"atom:link").each do |l| - next unless l["rel"] - if l["type"] and CONTENT_TYPES.include?(l["type"].downcase.strip) and l["rel"].downcase == "self" - self.add_feed(l["href"], url, @base_uri) - end - end - - (doc/"link").each do |l| - next unless l["rel"] - if l["type"] and CONTENT_TYPES.include?(l["type"].downcase.strip) and (l["rel"].downcase =~ /alternate/i or l["rel"] == "service.feed") - self.add_feed(l["href"], url, @base_uri) - end - end - - (doc/"a").each do |a| - next unless a["href"] - if self.looks_like_feed?(a["href"]) and (a["href"] =~ /\// or a["href"] =~ /#{url_uri.host}/) - self.add_feed(a["href"], url, @base_uri) - end - end - - (doc/"a").each do |a| - 
next unless a["href"] - if self.looks_like_feed?(a["href"]) - self.add_feed(a["href"], url, @base_uri) - end - end + next unless l["rel"] + if l["type"] and CONTENT_TYPES.include?(l["type"].downcase.strip) and l["rel"].downcase == "self" + self.add_feed(l["href"], url, @base_uri) + end + end + + (doc/"link").each do |l| + next unless l["rel"] + if l["type"] and CONTENT_TYPES.include?(l["type"].downcase.strip) and (l["rel"].downcase =~ /alternate/i or l["rel"] == "service.feed") + self.add_feed(l["href"], url, @base_uri) + end + end + + (doc/"a").each do |a| + next unless a["href"] + if self.looks_like_feed?(a["href"]) and (a["href"] =~ /\// or a["href"] =~ /#{url_uri.host}/) + self.add_feed(a["href"], url, @base_uri) + end + end + + (doc/"a").each do |a| + next unless a["href"] + if self.looks_like_feed?(a["href"]) + self.add_feed(a["href"], url, @base_uri) + end + end # Added support for feeds like http://tabtimes.com/tbfeed/mashable/full.xml if url.match(/.xml$/) and doc.root and doc.root["xml:base"] and doc.root["xml:base"].strip == url.strip - self.add_feed(url, nil) + self.add_feed(url, nil) end - end - rescue Timeout::Error => err - $stderr.puts "Timeout error ocurred with `#{url}: #{err}'" - rescue OpenURI::HTTPError => the_error - $stderr.puts "Error ocurred with `#{url}': #{the_error}" - rescue SocketError => err - $stderr.puts "Socket error ocurred with: `#{url}': #{err}" - rescue => ex - $stderr.puts "#{ex.class} error ocurred with: `#{url}': #{ex.message}" - ensure - return @feeds - end - - end - - def looks_like_feed?(url) - if url =~ /(\.(rdf|xml|rdf|rss)$|feed=(rss|atom)|(atom|feed)\/?$)/i - true - else - false - end - end - - def add_feed(feed_url, orig_url, base_uri = nil) - # puts "#{feed_url} - #{orig_url}" - url = feed_url.sub(/^feed:/, '').strip - - if base_uri - # url = base_uri + feed_url - url = URI.parse(base_uri).merge(feed_url).to_s - end - - begin - uri = URI.parse(url) - rescue - puts "Error with `#{url}'" - exit 1 - end - unless 
uri.absolute? - orig = URI.parse(orig_url) - url = orig.merge(url).to_s - end - - # verify url is really valid - @feeds.push(url) unless @feeds.include?(url)# if self._is_http_valid(URI.parse(url), orig_url) - end - - # not used. yet. - def _is_http_valid(uri, orig_url) - req = Net::HTTP.get_response(uri) - orig_uri = URI.parse(orig_url) - case req - when Net::HTTPSuccess then - return true - else - return false - end - end + end + rescue Timeout::Error => err + $stderr.puts "Timeout error ocurred with `#{url}: #{err}'" + rescue OpenURI::HTTPError => the_error + $stderr.puts "Error ocurred with `#{url}': #{the_error}" + rescue SocketError => err + $stderr.puts "Socket error ocurred with: `#{url}': #{err}" + rescue => ex + $stderr.puts "#{ex.class} error ocurred with: `#{url}': #{ex.message}" + ensure + return @feeds + end + + end + + def looks_like_feed?(url) + if url =~ /(\.(rdf|xml|rss)$|feed=(rss|atom)|(atom|feed)\/?$)/i + true + else + false + end + end + + def add_feed(feed_url, orig_url, base_uri = nil) + # puts "#{feed_url} - #{orig_url}" + url = feed_url.sub(/^feed:/, '').strip + + if base_uri + # url = base_uri + feed_url + url = URI.parse(base_uri).merge(feed_url).to_s + end + + begin + uri = URI.parse(url) + rescue + puts "Error with `#{url}'" + exit 1 + end + unless uri.absolute? + orig = URI.parse(orig_url) + url = orig.merge(url).to_s + end + + # verify url is really valid + @feeds.push(url) unless @feeds.include?(url)# if self._is_http_valid(URI.parse(url), orig_url) + end + + # not used. yet. 
+ def _is_http_valid(uri, orig_url) + req = Net::HTTP.get_response(uri) + orig_uri = URI.parse(orig_url) + case req + when Net::HTTPSuccess then + return true + else + return false + end + end end if __FILE__ == $0 diff --git a/test/feedbag_test.rb b/test/feedbag_test.rb index acd409f..b0f294f 100644 --- a/test/feedbag_test.rb +++ b/test/feedbag_test.rb @@ -1,7 +1,7 @@ require 'test_helper' -class FeedbagTest < Test::Unit::TestCase - +class FeedbagTest < MiniTest::Unit::TestCase + context "Feedbag.feed? should know that an RSS url is a feed" do setup do @rss_url = 'http://example.com/rss/' @@ -11,7 +11,7 @@ class FeedbagTest < Test::Unit::TestCase assert Feedbag.feed?(@rss_url) end end - + context "Feedbag.feed? should know that an RSS url with parameters is a feed" do setup do @rss_url = "http://example.com/data?format=rss" diff --git a/test/test_helper.rb b/test/test_helper.rb index 6f6a2d8..87befa3 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -1,8 +1,8 @@ require 'rubygems' -require 'test/unit' +require 'minitest/autorun' require 'shoulda' require 'mocha/setup' $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) $LOAD_PATH.unshift(File.dirname(__FILE__)) -require 'feedbag' \ No newline at end of file +require 'feedbag'