
How to scrape a website that requires login first with Python


This works for me:

##################################### Method 1
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text

# Browser
br = mechanize.Browser()

# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Chrome')]

# The site we will navigate into, handling its session
br.open('https://github.com/login')

# View available forms
for f in br.forms():
    print f

# Select the second (index one) form (the first form is a search query box)
br.select_form(nr=1)

# User credentials
br.form['login'] = 'mylogin'
br.form['password'] = 'mypass'

# Login
br.submit()

print(br.open('https://github.com/settings/emails').read())

You were not far off at all!
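
Note that mechanize, cookielib and the old BeautifulSoup import are Python 2 only. On Python 3 the same idea (submit the login form once, then reuse the session cookies for every later request) can be sketched with requests; this sketch is not from the answer above, and the field names and token lookup are assumptions you should verify against the actual login page:

import requests
from bs4 import BeautifulSoup

# A Session keeps cookies across requests, much like mechanize's cookie jar
session = requests.Session()
session.headers['User-Agent'] = 'Chrome'

# Fetch the login page to pick up session cookies and the hidden CSRF token
login_page = session.get('https://github.com/login')
soup = BeautifulSoup(login_page.text, 'html.parser')

# ASSUMPTION: the field names 'login', 'password' and 'authenticity_token'
# and the '/session' action match GitHub's form; inspect the page to confirm
token = soup.find('input', {'name': 'authenticity_token'})['value']
session.post('https://github.com/session', data={
    'login': 'mylogin',
    'password': 'mypass',
    'authenticity_token': token,
})

# The session now carries the authentication cookies
print(session.get('https://github.com/settings/emails').text)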


I'd love to add my solution alongside the others. This answer mainly follows the hacky/lazy approach I take in everything I do. I went with it mainly because I was too lazy to handle the cookies, session data, etc.

This solution is most useful if you want to scrape multiple pages of a website after logging in with a single account's credentials (e.g. all of your Pinterest boards), not if you want to automate authentication with multiple accounts.

So my solution is Selenium together with Firefox profiles.

  • Create a new Firefox profile, note the location where it is stored, open Firefox with that profile, and log in to the website manually (see the Firefox documentation for details about profiles).
  • Now use Selenium with this profile: the Selenium session will reuse the cookies and session data from the Firefox profile, so your authentication persists.

I devised this mechanism when I needed to scrape a few Pinterest pages. I have added a few lines of code from the sample showing how to use the profile. Adapt the code to your needs.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException

# Replace with the path to your Firefox profile
fp = webdriver.FirefoxProfile('C:/Users/SJ/AppData/Roaming/Mozilla/Firefox/Profiles/hlsfrs2o.scrape')

# Enter your URL here
url = ""
driver = webdriver.Firefox(fp)
driver.get(url)
html_source = driver.page_source
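
To make the "multiple pages after a single login" point concrete, here is a small follow-on sketch (not from the original answer; the URLs are placeholders) that reuses the same authenticated driver for several pages:

# Hypothetical list of pages behind the login; replace with your own
urls = [
    'https://www.pinterest.com/example_user/board-one/',
    'https://www.pinterest.com/example_user/board-two/',
]

pages = {}
for page_url in urls:
    driver.get(page_url)                  # cookies from the profile keep us logged in
    pages[page_url] = driver.page_source  # parse with your HTML library of choice

driver.quit()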


The classic way to approach this problem is:

  1. launch a browser, go to the site and find the login page
  2. inspect the source code of the page to find out:
     I. which form is the login form (a page can have many forms, but usually one of them is the login form)
     II. which field names are used for username and password (these can vary a lot)
     III. whether there are other fields that must be submitted (like an authentication token)
  3. write the Scrapy spider to replicate the form submission using FormRequest (see the sketch right after this list)
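
As an illustration of step 3 (this sketch is mine, not part of the original answer), a hand-written spider with a current Scrapy version could use FormRequest.from_response, which copies the hidden fields (such as an authentication token) from the page for you; the 'login' and 'password' field names are assumptions you would replace with what step 2 reveals:

import scrapy
from scrapy.http import FormRequest


class LoginSpider(scrapy.Spider):
    # Minimal sketch of the classic approach; field names are assumed
    name = 'login_sketch'
    start_urls = ['https://github.com/login']

    def parse(self, response):
        # from_response pre-fills hidden inputs (e.g. the auth token)
        return FormRequest.from_response(
            response,
            formdata={'login': 'mylogin', 'password': 'mypass'},
            callback=self.after_login,
        )

    def after_login(self, response):
        # Authenticated from here on; scrape pages that require login
        self.logger.info('Logged in, landed on %s', response.url)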

Being fans of automation, we figured we could write some code to automate point 2 (which is actually the most time-consuming part), and the result is loginform, a library that automatically fills in login forms given the login page, username and password. Here is the code of a simple spider that uses loginform to log in to sites automatically.

githubloginspider.py

from scrapy.spider import BaseSpider
from scrapy.http import FormRequest
from scrapy.http.request import Request
from loginform import fill_login_form
from scrapy import log
from scraping.articles import ArticleItem


class GitHubLogin(BaseSpider):

    name = 'GitHubLogin'
    allowed_domains = ['github.com']
    start_urls = ['http://github.com/login']
    login_user = 'ranvijay5686'
    login_pass = ''

    def parse(self, response):
        (args, url, method) = fill_login_form(response.url,
                response.body, self.login_user, self.login_pass)
        return FormRequest(url, method=method, formdata=args,
                           callback=self.after_login)

    def after_login(self, response):
        # for link in response.xpath("//*[@id='site-container']/div[2]/div[4]/p/a/@href").extract():
        item = ArticleItem()
        item['title'] = 'ranvijay'
        log.msg('***************    :   '
                + str(response.xpath("//form[@class='subnav-search left']/input/@value").extract()))
        item['url'] = \
            response.xpath("//*[@id='site-container']/div[1]/div/div/span/span/text()").extract()
        yield item

items.py

from scrapy.item import Item, Field


class ArticleItem(Item):
    title = Field()
    url = Field()

loginform.py

import sys
from argparse import ArgumentParser
from collections import defaultdict
from lxml import html

__version__ = '1.0'  # also update setup.py


def _form_score(form):
    score = 0

    # In case of user/pass or user/pass/remember-me
    if len(form.inputs.keys()) in (2, 3):
        score += 10

    typecount = defaultdict(int)
    for x in form.inputs:
        type_ = (x.type if isinstance(x, html.InputElement) else 'other')
        typecount[type_] += 1

    if typecount['text'] > 1:
        score += 10
    if not typecount['text']:
        score -= 10

    if typecount['password'] == 1:
        score += 10
    if not typecount['password']:
        score -= 10

    if typecount['checkbox'] > 1:
        score -= 10
    if typecount['radio']:
        score -= 10

    return score


def _pick_form(forms):
    """Return the form most likely to be a login form"""
    return sorted(forms, key=_form_score, reverse=True)[0]


def _pick_fields(form):
    """Return the most likely field names for username and password"""
    userfield = passfield = emailfield = None
    for x in form.inputs:
        if not isinstance(x, html.InputElement):
            continue
        type_ = x.type
        if type_ == 'password' and passfield is None:
            passfield = x.name
        elif type_ == 'text' and userfield is None:
            userfield = x.name
        elif type_ == 'email' and emailfield is None:
            emailfield = x.name
    return (userfield or emailfield, passfield)


def submit_value(form):
    """Returns the value for the submit input, if any"""
    for x in form.inputs:
        if x.type == 'submit' and x.name:
            return [(x.name, x.value)]
    else:
        return []


def fill_login_form(url, body, username, password):
    doc = html.document_fromstring(body, base_url=url)
    form = _pick_form(doc.xpath('//form'))
    (userfield, passfield) = _pick_fields(form)
    form.fields[userfield] = username
    form.fields[passfield] = password
    form_values = form.form_values() + submit_value(form)
    return (form_values, form.action or form.base_url, form.method)


def main():
    ap = ArgumentParser()
    ap.add_argument('-u', '--username', default='username')
    ap.add_argument('-p', '--password', default='secret')
    ap.add_argument('url')
    args = ap.parse_args()

    try:
        import requests
    except ImportError:
        print 'requests library is required to use loginform as a tool'
        return 1  # bail out early; requests is used below

    r = requests.get(args.url)
    (values, action, method) = fill_login_form(args.url, r.text,
            args.username, args.password)

    print '''url: {0}
method: {1}
payload:'''.format(action, method)
    for (k, v) in values:
        print '- {0}: {1}'.format(k, v)


if __name__ == '__main__':
    sys.exit(main())
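
loginform can also be used outside Scrapy. As a rough sketch (not from the original answer; the URL and credentials are placeholders, and it assumes the picked form submits via POST), you can pair it with requests to actually perform the login:

import requests
from loginform import fill_login_form

url = 'https://github.com/login'  # placeholder; use your target site
session = requests.Session()

# Let loginform find the login form and fill in the credentials
r = session.get(url)
values, action, method = fill_login_form(url, r.text, 'mylogin', 'mypass')

# Submit the filled form; the Session keeps the resulting cookies
# (assumes a POST form; pass params= instead of data= for GET forms)
session.request(method, action, data=dict(values))
print(session.get('https://github.com/settings/emails').status_code)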