I've been reading up on web crawlers lately, so I wrote a small one for practice.
It isn't finished yet; so far it only gets as far as returning the JSON data.
Reference: http://cuiqingcai.com/1001.html. The author has written a whole series of crawler tutorials there, and they are very practical!
I used the login of the 宜送 (Yi Express) forwarding service for practice, since I recently fell into the Shenzhen customs pit anyway. Sigh... the less said about that, the better.
Main things to note:
- Yi Express login: the request must carry the HTTP header Content-Type: application/x-www-form-urlencoded, otherwise the site immediately reports a failed login. Probably because so few people log in, there is currently no CAPTCHA.
- Fetching the account's order info: after the page loads, an AJAX call makes a second request to the server for the order data. The request body is itself POSTed as JSON, and the response is JSON too, which spares us regex matching entirely! (A minimal sketch of both requests follows this list.)
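
To make both notes concrete, here is a minimal sketch of the raw request flow. The URLs and field names are the same ones used in the full script below; the credentials are placeholders, and error handling is omitted for brevity.

```python
# -*- coding: UTF-8 -*-
# Minimal sketch of the two requests described above (Python 2).
# URLs and field names come from the full script below; credentials are fake.
import urllib
import urllib2
import cookielib
import json

cookie_jar = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))

# 1) Login: the body MUST be form-encoded, or the site reports a failed login.
login_data = urllib.urlencode({'j_username': 'user@example.com',
                               'j_password': 'secret'})
login_req = urllib2.Request('https://www.yi-express.com/login/send',
                            login_data,
                            {'Content-Type': 'application/x-www-form-urlencoded'})
opener.open(login_req)  # the session cookie is now stored in cookie_jar

# 2) Order search: the request body is JSON, and so is the response.
order_req = urllib2.Request('https://www.yi-express.com/customer/orderSearch')
order_req.add_header('Content-Type', 'application/json')
body = json.dumps({'pageNo': '1', 'orderNoOrName': '', 'warehouse': '',
                   'orderType': '', 'orderCreateStart': '', 'orderCreateEnd': ''})
orders = json.loads(opener.open(order_req, body).read())
print orders['hasNext'], len(orders['result'])
```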
```python
# -*- coding: UTF-8 -*-

__author__ = 'Bowei'

import urllib
import urllib2
import cookielib
import json


class YiCrawler:
    # Maps the site's internal status codes to human-readable labels
    statusCode = {
        'AIRARRDEST': '已到达',   # arrived at destination
        'DONE': '已完成',         # completed
        'CLEAR': '清关中',        # in customs clearance
        'CONFIRM': '已确认'       # confirmed
    }

    def __init__(self):
        self.userinfo = {'j_username': '', 'j_password': ''}
        self.cookie = cookielib.CookieJar()
        # The form-encoded Content-Type is required, or login fails outright
        self.headers = {'Content-Type': 'application/x-www-form-urlencoded',
                        'Cache-Control': 'no-cache'}
        self.cookie_handler = urllib2.HTTPCookieProcessor(self.cookie)
        self.opener = urllib2.build_opener(self.cookie_handler)
        self.results = []  # holds the query results; each element is a dict

    def get_status_str(self, status):
        # Fall back to the raw code if we don't recognize it
        if status in self.statusCode:
            return self.statusCode[status]
        else:
            return status

    def set_user_info(self, username, password):
        self.userinfo['j_username'] = username
        self.userinfo['j_password'] = password

    def login(self):
        print 'Login start'
        login_data = urllib.urlencode(self.userinfo)
        url = 'https://www.yi-express.com/login/send'
        request = urllib2.Request(url, login_data, self.headers)
        try:
            response = self.opener.open(request)
            print 'Finished login'
            return response.getcode()
        except urllib2.HTTPError, e:
            print e.code
            return e.code

    def get_page(self, pageNo=1, orderType="", warehouse=""):
        # The order-search endpoint takes a JSON body and returns JSON
        url = 'https://www.yi-express.com/customer/orderSearch'
        order_params = {"pageNo": str(pageNo),
                        "orderNoOrName": "",
                        "warehouse": warehouse,
                        "orderType": orderType,
                        "orderCreateStart": "",
                        "orderCreateEnd": ""}
        request = urllib2.Request(url)
        request.add_header('Content-Type', 'application/json')
        try:
            response = self.opener.open(request, json.dumps(order_params))
            response_decoded = json.loads(response.read())
            for key, value in response_decoded.iteritems():
                # print str(key) + ': ' + str(value)
                if str(key) == 'result':
                    self.results.extend(value)
            return response_decoded['hasNext']
        except urllib2.HTTPError, e:
            print e.code
            return False

    def fetch_all(self):
        # Fetch data from every page, following the 'hasNext' flag
        self.results = []
        hasNext = True
        pageCount = 1
        while hasNext:
            hasNext = self.get_page(pageCount)
            pageCount += 1

    def print_results(self):
        print '========= START ==========='
        for item in self.results:
            print 'ID:\t' + item['id']
            print 'TIME:\t' + item['inDateWithFormat']
            print 'PRICE:\t' + str(item['realCost'])
            print 'WEIGHT:\t' + str(item['realWeight'])
            print 'EXP_NAME:\t' + item['expressName']
            print 'STATUS:\t' + self.get_status_str(item['statusCode'])
            print '\n'


if __name__ == '__main__':
    crawler = YiCrawler()
    # Fill in real credentials first, e.g. crawler.set_user_info('user', 'pass')
    print crawler.login()
    # crawler.get_page(pageNo=1)
    crawler.fetch_all()
    crawler.print_results()
```
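
For reference, judging from the keys that print_results accesses, each entry in results looks roughly like the record below. Only the key names come from the code above; the values are invented placeholders.

```python
# Hypothetical shape of one order record in the 'result' array.
# Key names are taken from print_results above; values are made up.
sample_item = {
    'id': 'ORDER-ID',                        # order identifier (string)
    'inDateWithFormat': '2015-01-01 00:00',  # formatted order time
    'realCost': 0.0,                         # actual cost charged
    'realWeight': 0.0,                       # actual weight
    'expressName': 'CARRIER-NAME',           # carrier name
    'statusCode': 'CLEAR',                   # mapped to a label via statusCode
}
```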