I've been reading about web crawlers lately, so I wrote a little something for practice.
It isn't finished yet; it only gets as far as returning the JSON data.
Reference: http://cuiqingcai.com/1001.html. The author has written a whole series of crawler tutorials, very practical!
I'm practicing on the Yi-Express (宜送转运) login, since I recently fell into the Shenzhen customs pit anyway. Sigh, the less said about that, the better.
Main things to note:
- Yi-Express login: the request must carry the HTTP header Content-Type: application/x-www-form-urlencoded, otherwise the site immediately reports a login failure. Probably because not many people log in, there is currently no CAPTCHA.
- Fetching the account's order info: after the page loads, the browser makes a second, AJAX request to the server for the order data. The request body is POSTed as JSON, and the response comes back as JSON too, which spares us regex matching entirely! (The request and response shapes are sketched right below.)
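For reference, the order-search exchange looks roughly like this. The request fields are taken straight from the code below; the response layout is my reconstruction from the keys the script reads, so treat it as a sketch rather than the service's documented format:

    # Request body POSTed to https://www.yi-express.com/customer/orderSearch (JSON):
    order_params = {"pageNo": "1", "orderNoOrName": "", "warehouse": "",
                    "orderType": "", "orderCreateStart": "", "orderCreateEnd": ""}

    # Response, reconstructed from the keys the script uses: a paging flag
    # plus a 'result' list of order dicts.
    response_example = {
        "hasNext": True,
        "result": [
            {"id": "...", "inDateWithFormat": "...", "realCost": 0.0,
             "realWeight": 0.0, "expressName": "...", "statusCode": "DONE"}
        ]
    }

The full script as it stands: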
# -*- coding: UTF-8 -*-
__author__ = 'Bowei'

import urllib
import urllib2
import cookielib
import json


class YiCrawler:
    # Maps the site's order status codes to human-readable labels
    statusCode = {
        'AIRARRDEST': 'Arrived at destination',
        'DONE': 'Completed',
        'CLEAR': 'In customs clearance',
        'CONFIRM': 'Confirmed'
    }

    def __init__(self):
        self.userinfo = {'j_username': '',
                         'j_password': ''}
        self.cookie = cookielib.CookieJar()
        # The login endpoint rejects requests without this Content-Type
        self.headers = {'Content-Type': 'application/x-www-form-urlencoded',
                        'Cache-Control': 'no-cache'}
        self.cookie_handler = urllib2.HTTPCookieProcessor(self.cookie)
        self.opener = urllib2.build_opener(self.cookie_handler)
        self.results = []  # holds the query results; each element is a dict

    def get_status_str(self, status):
        if status in self.statusCode:
            return self.statusCode[status]
        else:
            return status

    def set_user_info(self, username, password):
        self.userinfo['j_username'] = username
        self.userinfo['j_password'] = password

    def login(self):
        print 'Login start'
        login_data = urllib.urlencode(self.userinfo)
        url = 'https://www.yi-express.com/login/send'
        request = urllib2.Request(url, login_data, self.headers)
        try:
            # The session cookie is captured by the opener's CookieJar
            response = self.opener.open(request)
            print 'Finished login'
            return response.getcode()
        except urllib2.HTTPError, e:
            print e.code
            return e.code

    def get_page(self, pageNo=1, orderType="", warehouse=""):
        url = 'https://www.yi-express.com/customer/orderSearch'
        order_params = {"pageNo": str(pageNo), "orderNoOrName": "",
                        "warehouse": warehouse, "orderType": orderType,
                        "orderCreateStart": "", "orderCreateEnd": ""}
        request = urllib2.Request(url)
        # This endpoint expects the POST body to be JSON, not form data
        request.add_header('Content-Type', 'application/json')
        try:
            response = self.opener.open(request, json.dumps(order_params))
            response_decoded = json.loads(response.read())
            # The orders live in the response's 'result' list
            if 'result' in response_decoded:
                self.results.extend(response_decoded['result'])
            return response_decoded.get('hasNext', False)
        except urllib2.HTTPError, e:
            print e.code
            return False

    def fetch_all(self):
        # Fetch every page of data, following the 'hasNext' flag
        self.results = []
        hasNext = True
        pageCount = 1
        while hasNext:
            hasNext = self.get_page(pageCount)
            pageCount += 1

    def print_results(self):
        print '========= START ==========='
        for item in self.results:
            print 'ID:\t' + item['id']
            print 'TIME:\t' + item['inDateWithFormat']
            print 'PRICE:\t' + str(item['realCost'])
            print 'WEIGHT:\t' + str(item['realWeight'])
            print 'EXP_NAME:\t' + item['expressName']
            print 'STATUS:\t' + self.get_status_str(item['statusCode'])
            print '\n'


if __name__ == '__main__':
    crawler = YiCrawler()
    # Fill in your own credentials here (placeholders, not real values)
    crawler.set_user_info('your_username', 'your_password')
    print crawler.login()
    # crawler.get_page(pageNo=1)
    crawler.fetch_all()
    crawler.print_results()
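Since the script currently stops at printing the orders, an obvious next step is to persist them. A minimal sketch, assuming a plain JSON dump is enough (the orders.json filename and the save_results helper are mine, not part of the original):

    import json

    def save_results(crawler, path='orders.json'):
        # crawler.results is the list of order dicts collected by fetch_all()
        with open(path, 'w') as f:
            json.dump(crawler.results, f, indent=2)

Call save_results(crawler) right after crawler.fetch_all() and the raw order data lands on disk, ready for whatever processing comes next.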