|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117 |
- import os
- import pickle
- import datetime
- from urllib.parse import urlparse
- import requests
-
-
class MyLoginSession:
    """
    a requests session that logs into a site once and caches the logged-in
    session in a file, so that later runs can reuse it without logging in again

    the cache file is named after the host part of 'loginUrl' and is created
    in the current working directory.
    """

    def __init__(self,
                 loginUrl,
                 loginData,
                 loginTestUrl,
                 loginTestString,
                 sessionFileAppendix='_session.dat',
                 maxSessionTimeSeconds=30 * 60,
                 proxies=None,
                 userAgent='Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
                 debug=False,
                 forceLogin=False,
                 **kwargs):
        """
        save some information needed to login the session, then log in

        you'll have to provide 'loginTestString' which will be looked for in the
        response html of 'loginTestUrl' to make sure you've properly been logged in

        'proxies' is of format { 'https' : 'https://user:pass@server:port', 'http' : ...
        'loginData' will be sent as post data (dictionary of id : value).
        'maxSessionTimeSeconds' will be used to determine when to re-login.
        'sessionFileAppendix' is appended to the host name to build the cache file name.
        'userAgent' is sent as the 'user-agent' header of a newly created session.
        'debug' enables progress messages on stdout.
        'forceLogin' skips the cache and always performs a proper login.
        'kwargs' are forwarded to the login post request.

        raises Exception if the login test fails (see 'login').
        """
        urlData = urlparse(loginUrl)

        self.proxies = proxies
        self.loginData = loginData
        self.loginUrl = loginUrl
        self.loginTestUrl = loginTestUrl
        self.maxSessionTime = maxSessionTimeSeconds
        self.sessionFile = urlData.netloc + sessionFileAppendix
        self.userAgent = userAgent
        self.loginTestString = loginTestString
        self.debug = debug

        self.login(forceLogin, **kwargs)

    def modification_date(self, filename):
        """
        return last file modification date as (naive, local time) datetime object

        raises OSError if the file does not exist or cannot be accessed.
        """
        t = os.path.getmtime(filename)
        return datetime.datetime.fromtimestamp(t)

    def _loadSessionFromCache(self):
        """
        try to restore 'self.session' from the cache file

        returns True if a session younger than 'maxSessionTime' seconds was
        loaded. Returns False (leaving 'self.session' untouched) if the cache
        file is missing, too old, or cannot be read or unpickled.
        """
        try:
            lastModified = self.modification_date(self.sessionFile)
        except OSError:
            # no cache file (or not accessible): nothing to load
            return False

        # total_seconds() includes whole days; timedelta.seconds alone wraps
        # around every 24 hours and would make a days-old cache look fresh
        cacheAge = (datetime.datetime.now() - lastModified).total_seconds()
        if cacheAge >= self.maxSessionTime:
            return False

        try:
            # NOTE: unpickling can execute code embedded in the file, so the
            # cache file must only ever be writable by trusted users
            with open(self.sessionFile, "rb") as f:
                cachedSession = pickle.load(f)
        except (OSError, EOFError, pickle.UnpicklingError,
                AttributeError, ImportError, IndexError):
            # truncated or corrupt cache: treat it like a missing one
            if self.debug:
                print('could not read session cache-file %s' % self.sessionFile)
            return False

        self.session = cachedSession
        if self.debug:
            print("loaded session from cache (last access %ds ago) "
                  % cacheAge)
        return True

    def login(self, forceLogin=False, **kwargs):
        """
        login to a session. Try to read last saved session from cache file. If this fails
        do proper login. If the last cache access was too old, also perform a proper login.
        The session cache file is written after every proper login.

        'forceLogin' skips the cache and always performs a proper login.
        'kwargs' are forwarded to the login post request.

        raises Exception if 'loginTestString' is not found (case-insensitively)
        in the response to 'loginTestUrl'.
        """
        if self.debug:
            print('loading or generating session...')

        wasReadFromCache = not forceLogin and self._loadSessionFromCache()
        if not wasReadFromCache:
            self.session = requests.Session()
            self.session.headers.update({'user-agent': self.userAgent})
            self.session.post(self.loginUrl, data=self.loginData,
                              proxies=self.proxies, **kwargs)

            if self.debug:
                print('created new session with login')
            self.saveSessionToCache()

        # test login (through the same proxies as every other request)
        res = self.session.get(self.loginTestUrl, proxies=self.proxies)
        if self.loginTestString.lower() not in res.text.lower():
            if self.debug:
                print(res.text)
            raise Exception("could not log into provided site '%s'"
                            " (did not find successful login string)"
                            % self.loginUrl)

    def saveSessionToCache(self):
        """
        save session to a cache file
        """
        # always save (to update timeout)
        with open(self.sessionFile, "wb") as f:
            pickle.dump(self.session, f)
        if self.debug:
            print('updated session cache-file %s' % self.sessionFile)

    def retrieveContent(self, url, method="get", postData=None, postDataFiles=None, **kwargs):
        """
        return the response for the url with respect to the session.

        If 'method' is not 'get', the url will be called with 'postData'
        (and 'postDataFiles' as uploaded files) as a post request.
        'kwargs' are forwarded to the underlying request.
        """
        if method == 'get':
            res = self.session.get(url, proxies=self.proxies, **kwargs)
        else:
            res = self.session.post(url, data=postData, proxies=self.proxies,
                                    files=postDataFiles, **kwargs)

        # the session has been updated on the server, so also update in cache
        self.saveSessionToCache()

        return res
|