
How does a Python crawler get a new cookie after login? (Python crawler practical training report)

How do I fetch cookies with the CInternetSession::GetCookie method?

The usage of these functions can be looked up on MSDN:

static BOOL GetCookie(
    LPCTSTR pstrUrl,
    LPCTSTR pstrCookieName,
    LPTSTR pstrCookieData,
    DWORD dwBufLen
);

static BOOL GetCookie(
    LPCTSTR pstrUrl,
    LPCTSTR pstrCookieName,
    CString& strCookieData
);

Parameters

pstrUrl

A pointer to a string containing the URL.

pstrCookieName

A pointer to a string containing the name of the cookie to get for the specified URL.

pstrCookieData

In the first overload, a pointer to a string containing the address of the buffer that receives the cookie data. This value can be NULL. In the second overload, a reference to a CString object to receive the cookie data.

dwBufLen

The variable specifying the size of the pstrCookieData buffer. If the function succeeds, the buffer receives the amount of data copied to the pstrCookieData buffer. If pstrCookieData is NULL, this parameter receives a value that specifies the size of the buffer necessary to copy all the cookie data.

Return Value

Returns TRUE if successful, or FALSE otherwise. If the call fails, call the Win32 function GetLastError to determine the cause of the error. The following error values apply:

ERROR_NO_MORE_ITEMS There is no cookie for the specified URL and all its parents.

ERROR_INSUFFICIENT_BUFFER The value passed in dwBufLen is insufficient to copy all the cookie data. The value returned in dwBufLen is the size of the buffer necessary to get all the data.
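Since the rest of this page is about Python: CInternetSession::GetCookie wraps the Win32 function InternetGetCookie, so the same two-call buffer-sizing pattern described above can be reproduced from Python with the standard-library ctypes module. A minimal, Windows-only sketch (the URL is a placeholder):

import ctypes
from ctypes import wintypes

wininet = ctypes.windll.wininet  # Windows only

def get_cookie(url):
    size = wintypes.DWORD(0)
    # First call with a NULL buffer fails with ERROR_INSUFFICIENT_BUFFER
    # but writes the required buffer length into `size`.
    wininet.InternetGetCookieW(url, None, None, ctypes.byref(size))
    if size.value == 0:
        return None  # ERROR_NO_MORE_ITEMS: no cookie for this URL
    buf = ctypes.create_unicode_buffer(size.value)
    if wininet.InternetGetCookieW(url, None, buf, ctypes.byref(size)):
        return buf.value
    return None

print(get_cookie(u"https://example.com/"))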

How do I successfully crawl an HTTPS login page with Python? Thanks.

My earlier attempts kept failing because I wasn't using the HTTPS-specific functions. After studying it more carefully, there are a few points to watch. First, when simulating login with POST, the cookie value in the request header will differ from site to site. Second, when GETting a page afterwards, you must attach the set-cookie value from the POST response; only then does the successful login actually take effect.
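For comparison, here is a minimal sketch of the same flow with the third-party requests library, which stores and resends cookies automatically; the host and form fields are the placeholders used in the full script below, not a real site:

import requests

session = requests.Session()  # keeps cookies across requests automatically

# Fetch the login page first so the server sets the csrftoken cookie.
session.get("https://buuuuuuu.knight/login/")
csrf = session.cookies.get("csrftoken", "")

# POST the credentials; the session resends the CSRF cookie and stores
# whatever Set-Cookie comes back (e.g. the session id).
session.post(
    "https://buuuuuuu.knight/login/",
    data={"username": "u", "password": "p",
          "next": "", "csrfmiddlewaretoken": csrf},
    headers={"Referer": "https://buuuuuuu.knight/login/"},
)

# Later GETs carry the logged-in cookies without any manual copying.
html = session.get("https://buuuuuuu.knight/changelog/").text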

After writing the POST and GET parts, I also wrote a simple command-line wrapper around them.

import httplib
import urllib
import sys

file_text = "build_change.txt"
resultTable = dict()
host = 'buuuuuuu.knight'

def Login(username, password, csrf='Gy2O70iSjOTbWhWgBLvf4HDuf4jUe4RP'):
    url = '/login/'
    values = {
        'username': username,
        'password': password,
        'next': '',
        'csrfmiddlewaretoken': csrf,
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Connection': 'keep-alive',
        # Django-style CSRF check: the cookie must match the form token.
        'Cookie': 'csrftoken=%s' % csrf,
        'Referer': 'https://buuuuuuu.knight/login/',
        'Origin': 'https://buuuuuuu.knight',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    }
    values = urllib.urlencode(values)
    conn = httplib.HTTPSConnection(host, 443)
    conn.request("POST", url, values, headers)
    response = conn.getresponse()
    print 'Login: ', response.status, response.reason
    # The Set-Cookie header of the POST response carries the logged-in
    # session; later GETs must send it back.
    return response.getheader("set-cookie")

def GetHtml(_url, cookie):
    get_headers = {
        'Host': host,
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Cookie': cookie,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    }
    conn = httplib.HTTPSConnection(host)
    conn.request("GET", _url, None, get_headers)
    res2 = conn.getresponse()
    print "Get %s:" % _url, res2.status, res2.reason
    data = res2.read()
    fp = open(file_text, "w")
    fp.write(data)
    fp.close()

def ParseHtml():
    # Collect every <div class="change-body"> block: grab the text after
    # the opening tag, then keep appending lines until the closing </div>.
    fp = open(file_text, "r")
    content = fp.readline()
    _pos = 0
    while content:
        if content.find("class=\"change-body\"") >= 0:
            topic = content.split(">")
            resultTable[_pos] = topic[1]
            while content:
                content = fp.readline()
                resultTable[_pos] = resultTable[_pos] + content
                if content.find("</div>") >= 0:
                    _pos = _pos + 1
                    break
        content = fp.readline()
    fp.close()
    print "Parse html success."

def GenerateResultTxt():
    f = open("build_change_result.txt", "w")
    for m in sorted(resultTable.keys()):  # keep the blocks in page order
        f.write("-------------------------------------------------------------------------------------------\n")
        f.write(resultTable[m])
    f.close()
    print "Generate result success : build_change_result.txt ."

def Help():
    print '-h    :    help'
    print '-u    :    username(must)'
    print '-p    :    password(must)'
    print '-c    :    csrftoken(optional)'
    print '-s    :    sandbox build id(must)'
    print 'For example:'
    print '[1]  python BuildChange.py -h'
    print '[2]  python BuildChange.py -u u -p p -s s1 s2'
    print '[3]  python BuildChange.py -u u -p p -c c -s s1 s2'

def ParseParam(com):
    length = len(com)
    username = ""
    password = ""
    csrf = ""
    sid1 = ""
    sid2 = ""
    # Valid argv lengths: 2 (-h), 8 (-u -p -s), 10 (-u -p -c -s).
    if length == 2 or length == 8 or length == 10:
        if com[1] == '-h':
            Help()
        for i in range(1, length):
            # Value tokens simply fail every flag test on their own pass.
            if com[i] == '-u' and i < (length - 1):
                username = com[i + 1]
            elif com[i] == '-p' and i < (length - 1):
                password = com[i + 1]
            elif com[i] == '-c' and i < (length - 1):
                csrf = com[i + 1]
            elif com[i] == '-s' and i < (length - 2):
                sid1 = com[i + 1]
                sid2 = com[i + 2]
    if username == "" or password == "" or sid1 == "" or sid2 == "":
        print '[Error] Parameter error!'
        print '[Error] Use "python BuildChange.py -h" to see how to use this script.'
    else:
        if csrf == "":
            cookie = Login(username, password)
        else:
            cookie = Login(username, password, csrf)
        _url = "//changelog//between//%s//and//%s/" % (sid1, sid2)
        GetHtml(_url, cookie)
        ParseHtml()
        GenerateResultTxt()

# Example: C:\Python27\python.exe C:\Users\knight\Desktop\build\BuildChange.py -u xux -p KKKKKKKK -s 1859409 1858525
if __name__ == "__main__":
    ParseParam(sys.argv)

How do I find a website's login URL for a Python crawler?

Here is the source of a simple Python crawler that grabs every URL on a page. It uses only the standard-library urllib module, with no third-party BeautifulSoup. (A multi-threaded Python crawler is a very practical tool, but it isn't needed here.)

The crawler source code is as follows:

import urllib

content = urllib.urlopen('http://www.iplaypython.com/').read()
s1 = 0
while s1 >= 0:
    begin = content.find(r'<a', s1)        # start of the next anchor tag
    m1 = content.find(r'href="', begin)    # start of its href attribute
    m2 = content.find(r'>', m1)            # end of the opening tag
    s1 = m2
    if begin <= 0:
        break
    elif content[m1:m2].find(r' ') != -1:
        # Other attributes follow the href; cut at the first space.
        m2 = content[m1:m2].find(r' ')
        url = content[m1+6:m1+m2-1]
        print url
    elif m2 >= 0:
        url = content[m1+6:m2-1]
        print url
print "end."
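The string scanning above is fragile (it assumes href="..." comes right after <a). Still using only the standard library, the same extraction is more robust with HTMLParser; a minimal Python 2 sketch:

from HTMLParser import HTMLParser
import urllib

class LinkParser(HTMLParser):
    # Called for every opening tag; print the href of each <a>.
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    print value

LinkParser().feed(urllib.urlopen('http://www.iplaypython.com/').read())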

How do I crawl a site that requires login with Python?

POST to the login form to get the cookie, then crawl with that cookie attached.
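A minimal sketch of that flow with the Python 2 standard library: cookielib captures the Set-Cookie from the login POST and replays it on every later request. The URL and form fields are placeholders.

import cookielib
import urllib
import urllib2

jar = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))

# POST the credentials; the jar captures the session cookie from Set-Cookie.
data = urllib.urlencode({'username': 'u', 'password': 'p'})
opener.open('https://example.com/login/', data)

# Subsequent requests send the stored cookie automatically.
html = opener.open('https://example.com/protected/').read()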