How to check if a value has changed on a website

Basically I am trying to run some code (Python 3.2) if the value on the website changes, otherwise wait a bit and check it later.

At first I thought that I could just save the value in a variable and compare it with the new value fetched the next time the script was run. But this quickly ran into problems, as the value was overwritten when the script ran again and re-initialized the variable.

So, I tried just saving the HTML of the webpage to a file and then comparing it with the HTML fetched the next time the script ran. No luck there either: the comparison kept coming up False, even when there were no changes.

Next up was pickling the web page and then trying to compare the pickled copy with the freshly fetched HTML. Interestingly, this did not work within the script either. BUT, if I type file = pickle.load(open('D:\\Download\\htmlString.p', 'rb')) after the script has run and then file == html, it shows True when there weren't any changes.

I am a little confused as to why this won't work when the script is running, but if I do the above at the prompt, it shows the correct answer.

Edit: Thanks for the answers so far, guys. The question I had was not really about other ways to solve this problem (although it is always useful to learn more ways to accomplish a task!), but rather why the code below does not work when it is run as a script, whereas if I reload the pickle object at the prompt after running the script and then test it against html, it returns True if there were no changes.

# Compare the freshly fetched page (htmlString) with the copy pickled by the
# previous run, and overwrite the pickle when they differ.
try: 
    # NOTE: this first load is never used; the file is loaded again on the next line.
    file = pickle.load( open( 'D:\\Download\\htmlString.p', 'rb'))
    if pickle.load( open( 'D:\\Download\\htmlString.p', 'rb')) == htmlString:
        print("Values haven't changed!")
        # sys.exit() works by raising SystemExit. Because this call sits inside
        # the try block, the bare `except:` below catches that exception too, so
        # the "unchanged" path falls into the handler (re-dumping the pickle and
        # printing 'ERROR') instead of terminating the script here.
        sys.exit(0)
    else:
        # Page differs from the saved copy: replace the pickle with the new HTML.
        pickle.dump( htmlString, open( 'D:\\Download\\htmlString.p', "wb" ) )  
        print('Saving')
except: 
    # Bare except: handles every exception raised in the try block, not just
    # a missing pickle file.
    pickle.dump( htmlString, open( 'D:\\Download\\htmlString.p', "wb" ) )
    print('ERROR')
+5
source share
4 answers

Edit: I hadn't realized that you were just looking for the problem with your script. Here's what I think the problem is, followed by my original answer, which takes a different approach to the bigger problem you are trying to solve.

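Your script is a good example of the danger of a blanket except: statement: it catches everything, including, in this case, the SystemExit raised by your sys.exit(0).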

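I assume the try block is there to handle the case where D:\Download\htmlString.p does not exist yet. That error is an IOError, and you can catch it specifically with except IOError: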

Here is your script, plus a bit of code in front to make it runnable, with the except: issue fixed:

import sys
import pickle
import urllib2

# Fetch the page and compare it with the copy pickled by the previous run.
# Exits with status 0 when nothing changed; otherwise stores the new HTML.

# Location of the pickle holding the HTML seen on the previous run.
pickle_path = 'D:\\Download\\htmlString.p'

request = urllib2.Request('http://www.iana.org/domains/example/')
response = urllib2.urlopen(request) # Make the request
htmlString = response.read()

try:
    # Only the load is guarded: a missing file is the one failure we expect.
    # `with` closes the handle as soon as the pickle has been read.
    with open(pickle_path, 'rb') as pickle_file:
        previous_html = pickle.load(pickle_file)
except IOError:
    # No saved copy yet (first run): store the current page.
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(htmlString, pickle_file)
    print('Created new file.')
else:
    if previous_html == htmlString:
        print("Values haven't changed!")
        # Safe to exit here: SystemExit is raised outside the try block.
        sys.exit(0)
    # The page changed: replace the saved copy.
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(htmlString, pickle_file)
    print('Saving')

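You may also want to look at os.path for handling your file paths - it will help anyone who later wants to run your script on another platform, and it saves you the ugly double backslashes.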

Edit 2: Adapted for your specific URL.

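That page contains a dynamically generated ad link that changes on every page load. It comes after the real content, so we can split the HTML string at that point and keep only the first part, discarding the piece that changes.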

import sys
import pickle
import urllib2

# Fetch the calendar page, drop the part that changes on every load, and
# compare the rest with the copy pickled by the previous run.
# Exits with status 0 when nothing changed; otherwise stores the new HTML.

# Location of the pickle holding the HTML seen on the previous run.
pickle_path = 'D:\\Download\\htmlString.p'

request = urllib2.Request('http://ecal.forexpros.com/e_cal.php?duration=weekly')
response = urllib2.urlopen(request) # Make the request
# Grab everything before the dynamic double-click link
htmlString = response.read().split('<iframe src="http://fls.doubleclick')[0]

# NOTE(review): the file is opened in text mode ('r'/'w') exactly as before so
# that a pickle written by an earlier run still loads. The pickle docs
# recommend binary mode; switch both modes together if you change this.
try:
    # Load the saved copy once (the original loaded it twice and discarded
    # the first result); `with` closes the handle afterwards.
    with open(pickle_path, 'r') as pickle_file:
        previous_html = pickle.load(pickle_file)
except IOError:
    # No saved copy yet (first run): store the current page.
    with open(pickle_path, 'w') as pickle_file:
        pickle.dump(htmlString, pickle_file)
    print('Created new file.')
else:
    if previous_html == htmlString:
        print("Values haven't changed!")
        # Safe to exit here: SystemExit is raised outside the try block.
        sys.exit(0)
    # The page changed: replace the saved copy.
    with open(pickle_path, 'w') as pickle_file:
        pickle.dump(htmlString, pickle_file)
    print('Saving')

HTML-, . , - . , , , , , , .

- .

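Does the server help you here? The HTTP Last-Modified header tells you when the page last changed, so you can compare it with the value you stored on the previous run (provided the server sends it, of course). You can even issue a HEAD request, which retrieves only the headers. That way, nothing has to be downloaded unless the page is new.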

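Better still, send an If-Modified-Since header with your request; a server that honours it answers 304 Not Modified when nothing has changed, and you never have to download the body.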

So, something like this:

import sys
import os.path
import urllib2

# Conditional fetch: send the Last-Modified value saved by the previous run as
# an If-Modified-Since header, exit when the server answers 304, and remember
# the new Last-Modified value otherwise.

url = 'http://www.iana.org/domains/example/'
saved_time_file = 'last time check.txt'

request = urllib2.Request(url)
if os.path.exists(saved_time_file):
    # If we've previously stored a time, get it and add it to the request.
    with open(saved_time_file, 'r') as time_file:
        # strip() drops a stray trailing newline (e.g. after hand-editing the
        # file), which must not end up inside a header value.
        last_time = time_file.read().strip()
    if last_time:
        request.add_header("If-Modified-Since", last_time)

try:
    response = urllib2.urlopen(request) # Make the request
except urllib2.HTTPError as err:
    if err.code == 304:
        print("Nothing new.")
        sys.exit(0)
    raise   # some other http error (like 404 not found etc); re-raise it.

last_modified = response.info().get('Last-Modified', False)
if last_modified:
    # Remember the server's timestamp for the next run.
    with open(saved_time_file, 'w') as time_file:
        time_file.write(last_modified)
else:
    print("Server did not provide a last-modified property. Continuing...")
    # Alternately, you could save the current time in HTTP-date format here:
    # http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3
    # This might work for some servers that don't provide Last-Modified, but do
    # respect If-Modified-Since.

# You should get here if the server won't confirm the content is old.
# Hopefully, that means it's new.
# HTML should be in response.read().

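Still, not every server sends Last-Modified. Also look into ETags, which serve a similar purpose and may be available where Last-Modified is not.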

+7

It would be more efficient to send a HEAD request and check the Content-Length of the document.

import urllib2
"""
read old length from file into variable
"""
# Ask only for the headers: overriding get_method makes urllib2 send a HEAD
# request instead of its default method.
request = urllib2.Request('http://www.yahoo.com')
request.get_method = lambda : 'HEAD'

response = urllib2.urlopen(request)
# NOTE(review): indexing the headers presumably raises KeyError when the
# server sends no Content-Length header -- confirm, or use .get() instead.
new_length = response.info()["Content-Length"]
# old_length is not defined in this snippet; it stands for the value that the
# placeholder string above says to read from a file.
# NOTE(review): new_length is a header value (a string), so old_length
# presumably has to be a string as well for this comparison -- confirm.
if old_length != new_length:
    print "something has changed"

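Keep in mind, though, that the content can change without its length changing, so an identical Content-Length does not prove that the page is unchanged. Treat it as a cheap first check only.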

+3

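You can always detect any change between the locally stored data and the remote data by hashing both and comparing the hashes. This is commonly done to verify downloaded data. For a continuous check you will need a while loop.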

import hashlib
import urllib

# Repeatedly compare an MD5 hash of the remote page with an MD5 hash of the
# locally stored copy and report whether they match.
num_checks = 20
last_check = 1
while last_check != num_checks:
  remote_data = urllib.urlopen('http://remoteurl').read()
  remote_hash = hashlib.md5(remote_data).hexdigest()

  # Read raw bytes ('rb') so that newline translation cannot make identical
  # content hash differently; `with` closes the file on every pass.
  with open('localfilepath', 'rb') as local_file:
    local_data = local_file.read()
  local_hash = hashlib.md5(local_data).hexdigest()
  if remote_hash == local_hash:
    print('right now, we match!')
  else:
    print('right now, we are different')

  # Advance the counter; without this the loop condition never becomes false
  # and the script polls forever.
  # NOTE(review): there is no pause between checks -- consider sleeping here
  # so the remote server is not hit in a tight loop.
  last_check += 1

, .

+1

I wasn't entirely clear on whether you just need to see if the website has changed, or whether you are going to do more with the website's data. If it is the former, definitely use a hash, as previously mentioned. Here is a working example (Python 2.6.1 on a Mac) that compares the complete old HTML with the new HTML; it should be easy to modify so that it uses hashes or just a specific part of the website, as you need. Hopefully the comments and docstrings make everything clear.

import urllib2

def getFilename(url):
    '''
    Input: url
    Return: a (string) filename to be used later for storing the urls contents

    The name is the url with a leading 'http://' removed, every '/' replaced
    by ':' and '.OLD' appended, e.g.
    'http://example.com/a' -> 'example.com:a.OLD'
    '''
    prefix = 'http://'
    name = str(url)
    # Remove the scheme as a whole prefix. str.lstrip('http://') would instead
    # strip any leading run of the characters h, t, p, ':' and '/', eating the
    # start of host names such as 'php.net' or 'http.example.com'.
    if name.startswith(prefix):
        name = name[len(prefix):]
    return name.replace("/", ":") + '.OLD'


def getOld(url):
    '''
    Input: url- a string containing a url
    Return: a string containing the old html, or None if there is no old file
    (checks if there already is a url.OLD file, and make an empty one if there isn't to handle the case that this is the first run)
    Note: the file created with the old html is the format url(with : for /).OLD
    '''
    oldFilename = getFilename(url)
    try:
        oldHTMLfile = open(oldFilename, 'r')
    except IOError:
        # Only a failed open is treated as "no old file"; anything else
        # (KeyboardInterrupt, programming errors, ...) now propagates.
        # file doesn't exist! so make it
        with open(oldFilename, 'w') as newHTMLfile:
            newHTMLfile.write("")
        return None

    # `with` closes the handle even if read() raises.
    with oldHTMLfile:
        return oldHTMLfile.read()

class ConnectionError(Exception):
    '''
    Error carrying a textual description of a failed connection in .value
    '''
    def __init__(self, value):
        # Keep plain strings untouched; turn anything else into its str() form.
        is_plain_string = type(value) == type('')
        self.value = value if is_plain_string else str(value)

    def __str__(self):
        label = 'ConnectionError: '
        return label + self.value




def htmlHasChanged(url):
    '''
    Input: url- a string containing a url
    Return: a boolean stating whether the website at url has changed
    Raises: ConnectionError if the url could not be fetched
    Note: the stored copy of the page (see getFilename) is overwritten with
    the freshly fetched html on every successful call
    '''
    try:
        response = urllib2.urlopen(url)
        try:
            fileRecvd = response.read()
        finally:
            # Release the connection even if read() fails.
            response.close()
    except Exception:
        # Catching Exception instead of using a bare except keeps
        # KeyboardInterrupt and SystemExit from being reported as
        # connection failures.
        print('Could not connect to %s, sorry!' % url)
        raise ConnectionError("urlopen() failed to open " + str(url))

    # getOld() returns None on the first run, which counts as "changed".
    hasChanged = getOld(url) != fileRecvd

    # rewrite file
    with open(getFilename(url), 'w') as f:
        f.write(fileRecvd)

    return hasChanged

if __name__ == '__main__':
    # Quick manual check against whatismyip.com: print whether the page
    # changed, or the error text if it could not be fetched.
    target = "http://automation.whatismyip.com/n09230945.asp"
    try:
        print(htmlHasChanged(target))
    except ConnectionError as e:
        print(e)
0
source

All Articles