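To check URLs against Google's robots.txt, you can use a third-party library, reppy, which parses robots.txt files. It can be installed with pip: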
pip install reppy
Here is an example session (in IPython) that uses reppy to fetch and query Google's robots.txt:
In [1]: import reppy
In [2]: x = reppy.fetch("http://google.com/robots.txt")
In [3]: x.atts
Out[3]:
{'agents': {'*': <reppy.agent at 0x1fd9610>},
'sitemaps': ['http://www.gstatic.com/culturalinstitute/sitemaps/www_google_com_culturalinstitute/sitemap-index.xml',
'http://www.google.com/hostednews/sitemap_index.xml',
'http://www.google.com/sitemaps_webmasters.xml',
'http://www.google.com/ventures/sitemap_ventures.xml',
'http://www.gstatic.com/dictionary/static/sitemaps/sitemap_index.xml',
'http://www.gstatic.com/earth/gallery/sitemaps/sitemap.xml',
'http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
'http://www.gstatic.com/trends/websites/sitemaps/sitemapindex.xml']}
In [4]: x.allowed("/catalogs/about", "My_crawler")
Out[4]: True
In [5]: x.allowed("/catalogs", "My_crawler")
Out[5]: False
In [7]: x.allowed("/catalogs/p?", "My_crawler")
Out[7]: True
In [8]: x.refresh()
In [9]: x.ttl
Out[9]: 3721.3556718826294
In [10]:
user689383