1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
| import json import socket import urllib.error import requests import urllib.request import urllib.parse from bs4 import BeautifulSoup import re import datetime import time import os import xlwings as xw import pandas as pd from pandas import Series, DataFrame from icecream import ic
# HTTP request headers passed to every API request built in answer().
#
# SECURITY NOTE(review): the cookie below looks like a captured logged-in
# browser session (it includes SESSIONID / z_c0 values). Hard-coding it in
# source leaks the credential to anyone who can read the file -- rotate it and
# load it from the environment or an untracked config file instead.
headers = {
    # One cookie per line; adjacent literals are concatenated into a single
    # 'name=value; name=value; ...' string.
    'cookie': (
        'SESSIONID=NbZdSDM9vxjCm49QloVUek1p4SESon111VN5lO28kIn; '
        'JOID=UFsVAkiKe3CkNfAXfIk7aNWUKxBp0hE3kWC-TR7bCwudQoEkFfollsI18hV_tVbo9R13y8QFnN59RSz0mIJJkc8=; '
        'osd=V1kVB0uNeXChNvcVfIw4b9eULhNu0BEykme8TRvYDAmdR4IjF_oglcU38hB8slTo8B5wycQAn9l_RSn3n4BJlMw=; '
        '__snaker__id=ShIwuWTxIbqCt5mH; '
        '_xsrf=MRidZTLvBDfDoBlr8SxykMmNOIownvTz; '
        '_zap=5ab57ada-5fad-4c43-bad7-98ba7cfd8b70; '
        'd_c0=AaDYTqHYBxePThzIvRSgV3wVIYN8gFqkMO0=|1688443541; '
        'YD00517437729195%3AWM_TID=viqubj%2ByzcNEAQQBFQKQ10KuEzBFBq%2Fw; '
        'YD00517437729195%3AWM_NI=JwTUymyovorPwgUZoHdFS7xEKPwSVWvE2kGkegE2%2FXs%2BbaNEXzkGw%2BMTLU1Hnh0InX6PLo3JAh%2Fp4ipfCa29n8rZqShNVWeX92copvzUSz0mQJJ%2FrSn762KrYimXIdfHVVI%3D; '
        'YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6ee8fe564f6889b93c565a2bc8ab6d85a939a9badc56db4bf97b0e246f7b88eb2c82af0fea7c3b92a8aa99cb6fc67bbb1ba90f15cb5b99bbbee44a89a9dd8fb4f8990b9b4d55db69bfe8cc661a7b98bd3ae54b1f5ae82cd4f9ca6a5a8f6218f8c84b6b444aeb800aaed54f6908490b24283b8e191b763b4ba8d82b133aee8bebbf459ac929ab8e854fbeead90f37eba8b9d9bb367939686d0f159aee9fad3ca7a9bb7a3b9cc70a6989ab7cc37e2a3; '
        '__snaker__id=kU2jSIVgLwdBiosC; '
        'q_c1=07f28f63c8c341e38b8e77b1a3baadee|1709526862000|1709526862000; '
        'Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1709175682,1709470462,1709526865; '
        'tst=r; '
        'SESSIONID=YiF7TOYcPqx8up2ja7VWg1rFOpIQJ4C0kxOFXpgAffB; '
        'JOID=VFkVCkpWcMxjjOUZYVkw0BgvMh5wCBiDXN-tQQsBBrVS8ZIsCgE3KAmM4xFgFBlzF9xHSAC72FpVUGsPRDJTT44=; '
        'osd=V10QAUxVdMloiuYdZFI20xwqORhzDB2IWtypRAAHBbFX-pQvDgQ8LgqI5hpmFx12HNpETAWw3llRVWAJRzZWRIg=; '
        'z_c0=2|1:0|10:1709526878|4:z_c0|80:MS4xY0RqOFNnQUFBQUFtQUFBQVlBSlZUWUZCeldiVDBweFVSTUppZ2t3cnN0ZWFEcHozbk9ZZTlRPT0=|a6c65f3e3c7bef8e605f054bb77d1b2a73ced4b9e47dc5c656ce5cc08975d3bc; '
        'Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1709530226; '
        'unlock_ticket=AUASYgdHahcmAAAAYAJVTXlj5WVi2jfvOou3I0SLnFLDR_-HFDvhTA==; '
        'KLBRSID=d017ffedd50a8c265f0e648afe355952|1709530226|1709526862'
    ),
    'referer': 'https://www.zhihu.com/question/327436952/answer/1210845801',
    # Kept on a single logical string: a raw line break inside a quoted
    # literal is a syntax error and a newline is not valid in a header value.
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/122.0.0.0 Safari/537.36'
    ),
}
def answer(answer_id=1210845801, limit=20, delay=20, max_retries=3):
    """Print every root comment of a Zhihu answer, one page at a time.

    Pages through the ``root_comments`` endpoint of the Zhihu v4 API, sending
    the module-level ``headers`` with each request. For every page it prints
    the ``paging.is_end`` flag and then the id, content and author name of
    each comment. Paging stops when the API reports ``is_end``.

    A failed request is retried at the *same* offset, so a transient error
    no longer skips a page of comments.

    Args:
        answer_id: Numeric id of the answer whose comments are fetched.
        limit: Page size; the offset advances by this amount per page.
        delay: Seconds to wait between requests and before each retry.
        max_retries: Number of consecutive retries allowed for one page
            before the function gives up and returns.

    Returns:
        None. All results are written to standard output.

    Raises:
        ValueError: If a response body is not valid JSON.
        KeyError: If a response lacks the expected ``paging`` / ``data`` /
            author fields.
    """
    # Process-wide socket default (as in the original code); it bounds both
    # the connect and the read phase of every request made below.
    socket.setdefaulttimeout(20)

    url_template = (
        'https://www.zhihu.com/api/v4/answers/{}/root_comments'
        '?limit={}&offset={}&order=normal&status=open'
    )
    offset = 0
    failures = 0  # consecutive failed attempts for the current offset

    while True:
        url = url_template.format(answer_id, limit, offset)
        try:
            req = urllib.request.Request(url=url, headers=headers)
            # The context manager closes the response even when reading or
            # JSON decoding raises.
            with urllib.request.urlopen(req) as response:
                payload = json.loads(response.read())
        except (urllib.error.URLError, socket.timeout) as e:
            # A timeout while reading the body is raised as socket.timeout,
            # which is not a URLError and carries no ``reason`` attribute.
            print(getattr(e, 'reason', e))
            failures += 1
            if failures > max_retries:
                print('giving up at offset {} after {} failed attempts'.format(
                    offset, failures))
                break
            time.sleep(delay)
            continue  # retry the same offset instead of skipping the page

        failures = 0
        is_end = payload['paging']['is_end']
        print(is_end)
        for comment in payload['data']:
            print(
                comment['id'],
                comment['content'],
                comment['author']['member']['name'],
            )

        if is_end:
            break
        # Advance only after the page was fetched and printed successfully.
        offset += limit
        time.sleep(delay)
# Script entry point: this call is unconditional, so the scrape starts
# whenever the file is executed or imported.
# NOTE(review): consider an `if __name__ == "__main__":` guard if this module
# is ever meant to be imported without side effects.
answer()
|