. .
# With open-uri loaded, URLs are opened through URI.open; Kernel#open no longer
# accepts URLs on Ruby 3.0 and later.
URI.open("http://...", http_basic_authentication: [user, password])
, .
.
require "open-uri"
require "timeout"
require "zlib"
# Request headers passed to open-uri on every fetch made by `crawl`.
# String keys are sent by open-uri as HTTP header fields.
SHINSO_HEADERS = {
  'Accept'          => '*/*',
  'Accept-Charset'  => 'utf-8, windows-1251;q=0.7, *;q=0.6',
  # Compressed responses are requested, so the body has to be decoded by the caller.
  'Accept-Encoding' => 'gzip,deflate',
  'Accept-Language' => 'bg-BG, bg;q=0.8, en;q=0.7, *;q=0.6',
  'Connection'      => 'keep-alive',
  'Cookie'          => '',
  # Placeholder values below: replace with your own contact address and agent string.
  'From'            => 'email@example.com',
  'Referer'         => 'http://svejo.net/',
  'User-Agent'      => 'Your user agent'
}
# Fetches +url_address+ with the SHINSO_HEADERS request headers.
#
# Resets +errors+ to an empty array on every call. Any failure (bad URL,
# network error, timeout, empty body) is appended to +errors+ instead of
# being raised.
#
# @param url_address [String] the address to fetch; a malformed address is
#   percent-escaped and parsed a second time before giving up
# @return [URI, nil] the address the document was actually served from
#   (open-uri's +base_uri+), or nil when the fetch failed
def crawl(url_address)
  self.errors = []

  begin
    url_address = URI.parse(url_address)
  rescue URI::InvalidURIError
    # URI.decode / URI.encode were removed in Ruby 3.0; the default parser
    # still provides the same unescape-then-escape round trip.
    parser = URI::DEFAULT_PARSER
    url_address = URI.parse(parser.escape(parser.unescape(url_address)))
  end
  url_address.normalize!

  # Overall 8 second budget for connecting and downloading.
  stream = Timeout.timeout(8) { url_address.open(SHINSO_HEADERS) }

  if stream.size.zero?
    self.errors << "Server said status 200 OK but document file is zero bytes."
    return
  end

  url_crawled = URI.parse(stream.base_uri.to_s)
  url_crawled
rescue StandardError => exception
  # StandardError (not Exception) so that Ctrl-C and exit still terminate
  # the process; Timeout::Error and network errors are StandardErrors.
  self.errors << exception
  nil
end
url_crawled - , .
.
https://developer.mozilla.org/en-US/docs/HTTP_access_control
If you still get an error, your server may not be configured correctly or there may be a problem with its certificate, and you should check both.
And if you're serious about parsing, you can also use the CharGuess and Zlib libraries to read the content correctly, and then convert the problematic encodings with Iconv. Here is an example.
# Decode the response body according to the Content-Encoding values reported
# by open-uri for this stream.
encodings = stream.content_encoding
document =
  if encodings.include?('gzip')
    Zlib::GzipReader.new(stream).read
  elsif encodings.include?('deflate')
    # Inflate decompresses; Zlib::Deflate would compress the body again.
    # NOTE(review): assumes the server sends zlib-wrapped deflate data — confirm
    # against servers that send raw deflate streams.
    Zlib::Inflate.inflate(stream.read)
  else
    stream.read
  end
self.charset_guess = CharGuess.guess(document)
Then just use Iconv on the content.
Hope this helps you.
Regards, Yavor
source
share