Python Common Notes (2)
前10']] # group by the key column and show the index and data
dfcod
a1.index = a1.index.droplevel()  # drop one level (and its name) from a MultiIndex
# Use a list to test whether Series values exist
df0[df0['id'].isin([3, 4])]    # keep rows whose id is in the list
df0[~df0['id'].isin([3, 4])]   # invert: keep rows whose id is NOT in the list
# Sort a Series by a custom list order
df['words'] = df['words'].astype('category')                  # must be converted to category first
df['words'].cat.reorder_categories([1, 2, 3], inplace=True)   # when the list contains exactly the existing categories
df['words'].cat.set_categories([1, 2, 3], inplace=True)       # when the list has more or fewer values than the categories
df.sort_values('words', inplace=True)
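A minimal runnable sketch of this custom-order sort, using made-up data and a made-up order list:

import pandas as pd

# hypothetical sample data, only to illustrate the category-based sort above
df = pd.DataFrame({'words': ['b', 'a', 'c'], 'cnt': [10, 20, 30]})
df['words'] = df['words'].astype('category')
df['words'] = df['words'].cat.set_categories(['c', 'a', 'b'])   # the desired order
df = df.sort_values('words')     # rows now come out in the order c, a, b
print(df)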
# pandas: read from and write to MySQL
from sqlalchemy import create_engine
mysq = create_engine('mysql+pymysql://root:mysql.123@localhost/abdata?charset=utf8')
df.to_sql('coun', mysq, if_exists='append', index=False)     # append rows to an existing table
df.to_sql('counts', mysq, if_exists='replace', index=False)  # drop the table and rewrite it
df = pd.read_sql_query('select * from cod1', mysq)           # query a MySQL table into a DataFrame
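For large tables, read_sql_query can also stream the result in chunks; a small sketch reusing the same engine (the chunk size is just an example):

import pandas as pd
from sqlalchemy import create_engine

mysq = create_engine('mysql+pymysql://root:mysql.123@localhost/abdata?charset=utf8')
# chunksize turns the result into an iterator of DataFrames instead of one large frame
for chunk in pd.read_sql_query('select * from cod1', mysq, chunksize=10000):
    print(len(chunk))            # process each chunk here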
# pymysql: read from and write to MySQL
import pymysql
conn = pymysql.connect(host='127.0.0.1', user='root', password='mysql.123',
                       database='data', charset='utf8')
cur = conn.cursor()
# take the last rows by id (sum is a row count defined earlier), then return them in ascending id order
sql1 = "SELECT * FROM (SELECT * FROM data1 ORDER BY id DESC LIMIT %s) aa ORDER BY id"
cur.execute(sql1, (sum,))   # let the driver bind the parameter instead of formatting the SQL string
c1 = cur.fetchall()         # read the rows from MySQL
conn.commit()               # only needed after INSERT/UPDATE statements (writes to MySQL)
cur.close()
conn.close()
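The snippet above only reads; a hedged sketch of the corresponding write path with pymysql (the table and column names are made-up examples):

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', password='mysql.123',
                       database='data', charset='utf8')
cur = conn.cursor()
rows = [(1, 'a'), (2, 'b')]                                # made-up data
sql2 = "INSERT INTO data1 (id, name) VALUES (%s, %s)"      # assumes data1 has id and name columns
cur.executemany(sql2, rows)                                # batch insert with driver-side parameter binding
conn.commit()                                              # the rows are only persisted after commit
cur.close()
conn.close()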
DataFrame style settings
# Conditional styling for a DataFrame (rendered in Jupyter)
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors

def show(v):
    col = 'black' if v > 0 else 'green'      # positive values in black, the rest in green
    return 'color:%s' % col

def background_gradient(s, m, M, cmap='PuBu', low=0, high=0.8):
    # color a subset of columns on one shared scale from m (min) to M (max)
    rng = M - m
    norm = colors.Normalize(m - (rng * low), M + (rng * high))
    normed = norm(s.values)
    c = [colors.rgb2hex(x) for x in plt.cm.get_cmap(cmap)(normed)]
    return ['background-color: %s' % color for color in c]

def highlight_max(s, m):
    is_max = s == m
    return ['background-color: yellow' if v else '' for v in is_max]

tabs.style.applymap(show)\
    .background_gradient(cmap='Reds', axis=1, low=0, high=1, subset=set1)\
    .apply(background_gradient, cmap='Purples',
           m=tabs[set2].min().min(), M=tabs[set2].max().max(), low=0, high=1, subset=set2)\
    .apply(highlight_max, m=tabs[set2].max().max())\
    .background_gradient(cmap='Wistia', axis=1, subset=['总金额'])

accdeteil.style.applymap(show)\
    .background_gradient(cmap='Reds', axis=1, low=0, high=1)\
    .background_gradient(cmap='Reds', axis=1, low=0, high=1, subset=set2)\
    .background_gradient(cmap='Purples', axis=1, low=0, high=1, subset=pd.IndexSlice['前10', :'9'])\
    .background_gradient(cmap='Purples', axis=1, low=0, high=1, subset=pd.IndexSlice['前20', :'9'])\
    .background_gradient(cmap='Purples', axis=1, low=0, high=1, subset=pd.IndexSlice['前05', '1_':])\
    .background_gradient(cmap='Purples', axis=1, low=0, high=1, subset=pd.IndexSlice['前15', '1_':])\
    .background_gradient(cmap='GnBu', axis=0, low=0, high=1, subset=['SH_'])\
    .apply(highlight_max, m=tabs[set2].max().max())
# Reference: https://blog.csdn.net/xiaodongxiexie/article/details/71202279
# Colormap names: https://matplotlib.org/tutorials/colors/colormaps.html
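The block above depends on DataFrames (tabs, accdeteil) and column sets (set1, set2) defined elsewhere in the notes; a minimal self-contained sketch of the same idea with made-up data and column names:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c'])    # made-up data

def show(v):
    return 'color:%s' % ('black' if v > 0 else 'green')

styled = (df.style
            .applymap(show)                                   # per-cell font color
            .background_gradient(cmap='Reds', subset=['a'])   # gradient over one column
            .highlight_max(color='yellow', subset=['b']))     # built-in max highlight
styled   # in Jupyter, evaluating the Styler renders the table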
Plotting with pandas
import matplotlib.pyplot as plt

ax1 = df1[['策略净值', '指数净值']].plot(figsize=(15, 8))    # line chart straight from the DataFrame
ax1.axhline(y=1, ls=":", c="r")                               # mark the break-even line
ax1.legend(loc='upper right')                                 # put the legend in the upper-right corner
plt.title('策略简单回测%s' % x, size=15)
plt.xlabel('')
for i in range(len(df1)):
    # mark buy/sell points: position goes from 0 to 1 (buy) or from 1 to 0 (sell)
    if df1['当天仓位'][i] == 0 and df1['当天仓位'].shift(-1)[i] == 1:
        plt.annotate('买', xy=(df1.index[i], df1.策略净值[i]), arrowprops=dict(facecolor='r', shrink=0.05))
    if df1['当天仓位'][i] == 0 and df1['当天仓位'].shift(1)[i] == 1:
        plt.annotate('卖', xy=(df1.index[i], df1.策略净值[i]), arrowprops=dict(facecolor='g', shrink=0.1))
bbox = dict(boxstyle="round", fc="w", ec="0.5", alpha=0.9)    # rounded text-box style
t = f'累计收益率:策略{TA1}%,指数{TA2}%;\n年化收益率:策略{AR1}%,指数{AR2}%;' + \
    f'\n最大回撤: 策略{MD1}%,指数{MD2}%;\n策略alpha: {round(alpha,2)},策略beta:{round(beta,2)}; \n夏普比率: {S}'
plt.text(df1.index[0], df1['指数净值'].min(), t, size=13, bbox=bbox)   # place the text box at a fixed position
ax = plt.gca()                                                # tidy up the axes
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
plt.show()
Web scraping
# BeautifulSoup approach
from bs4 import BeautifulSoup
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
htm = requests.get(url=url, headers=headers, timeout=30, stream=False).text
soup = BeautifulSoup(htm, 'html.parser')
txt = soup.find_all('div', class_='lax-s')
#txt = soup.find('div', class_='qi').children

# lxml etree approach; original article: https://mp.weixin.qq.com/s/c2Sg_LVTjOokePY2lxCGSA
import requests
import pandas as pd
from pprint import pprint
from lxml import etree
import time
import warnings
warnings.filterwarnings("ignore")

for i in range(1, 15):
    print("正在爬取第" + str(i) + "页的数据")
    url = "https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE,2," + str(i) + '.html?'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    web = requests.get(url, headers=headers)
    web.encoding = "gbk"
    dom = etree.HTML(web.text)
    # print(etree.tostring(dom, encoding="utf-8", pretty_print=True).decode("utf-8"))  # dump the whole HTML; the dom object cannot be printed directly
    # 1. job title
    job_name = dom.xpath('//div[@class="dw_table"]/div[@class="el"]//p/span/a[@target="_blank"]/@title')
    # 2. company name
    company_name = dom.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t2"]/a[@target="_blank"]/@title')
    # 3. work location
    address = dom.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t3"]/text()')
    # 4. salary: this column has missing values, so take the nodes first and read .text,
    #    which keeps a None placeholder and keeps all columns the same length
    salary_mid = dom.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t4"]')
    salary = [s.text for s in salary_mid]
    # 5. posting date
    release_time = dom.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t5"]/text()')
    # ------------------------------------------------------------------------------ #
    # The remaining fields live on the detail pages, so first collect their URLs.
    # 6. detail-page URLs
    deep_url = dom.xpath('//div[@class="dw_table"]/div[@class="el"]//p/span/a[@target="_blank"]/@href')
    RandomAll = []
    JobDescribe = []
    CompanyType = []
    CompanySize = []
    Industry = []
    for j in range(len(deep_url)):           # use j so the page counter i is not shadowed
        web_test = requests.get(deep_url[j], headers=headers)
        web_test.encoding = "gbk"
        dom_test = etree.HTML(web_test.text)
        # 7. experience and education, kept in one raw field to be cleaned later
        random_all = dom_test.xpath('//div[@class="tHeader tHjob"]//div[@class="cn"]/p[@class="msg ltype"]/text()')
        # 8. job description
        job_describe = dom_test.xpath('//div[@class="tBorderTop_box"]//div[@class="bmsg job_msg inbox"]/p/text()')
        # 9. company type
        company_type = dom_test.xpath('//div[@class="tCompany_sidebar"]//div[@class="com_tag"]/p[1]/@title')
        # 10. company size (headcount)
        company_size = dom_test.xpath('//div[@class="tCompany_sidebar"]//div[@class="com_tag"]/p[2]/@title')
        # 11. industry
        industry = dom_test.xpath('//div[@class="tCompany_sidebar"]//div[@class="com_tag"]/p[3]/@title')
        # append each field to its own list
        RandomAll.append(random_all)
        JobDescribe.append(job_describe)
        CompanyType.append(company_type)
        CompanySize.append(company_size)
        Industry.append(industry)
        # sleep a little to avoid anti-scraping measures
        time.sleep(1)
    # Save after every page, so a failure late in the run does not lose everything.
    df = pd.DataFrame()
    df["岗位名称"] = job_name
    df["公司名称"] = company_name
    df["工作地点"] = address
    df["工资"] = salary
    df["发布日期"] = release_time
    df["经验、学历"] = RandomAll
    df["公司类型"] = CompanyType
    df["公司规模"] = CompanySize
    df["所属行业"] = Industry
    df["岗位描述"] = JobDescribe
    # Writing out may occasionally fail, so wrap it in a try/except.
    try:
        df.to_csv("job_info.csv", mode="a+", header=None, index=None, encoding="gbk")
    except:
        print("当页数据写入失败")
    time.sleep(1)
print("完毕")
OCR image recognition
# Requires: tesseract-ocr installed (and on PATH), the chi_sim.traineddata language pack, and pytesseract-0.2.4
from PIL import Image
import pytesseract, os, re

png = r'D:\123\111.png'
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'
img = Image.open(png)
tim = os.stat(png).st_mtime      # modification time of the image file
img1 = img.size                  # (width, height)
aa = pytesseract.image_to_string(img, lang='chi_sim')   # recognize Simplified Chinese text
print(img1, tim)
print(aa)
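Recognition quality often improves if the image is grayscaled and binarized first; a small sketch (the 140 threshold is an arbitrary example):

from PIL import Image
import pytesseract

img = Image.open(r'D:\123\111.png').convert('L')     # convert to grayscale
img = img.point(lambda p: 255 if p > 140 else 0)     # binarize with an example threshold
print(pytesseract.image_to_string(img, lang='chi_sim'))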
Automated UI testing with Selenium WebDriver
# Requires: chromedriver v69 and the matching Chrome 69 (ChromeSetup_64_69.exe)
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# fs (an open log file) and s (a dict holding the expected key and the A/B/C/T values) are defined earlier in the script
try:
    driver = webdriver.Chrome()
    driver.get("http://user/login")
    time.sleep(1)
    driver.find_element_by_id('username').send_keys('123123')
    driver.find_element_by_id('password').send_keys('123123')
    driver.find_element_by_id('login').click()
    time.sleep(2)
    driver.find_element_by_xpath('//*[@id="header"]/div[7]/div/div[1]/ul/li[4]/a').click()
    time.sleep(2)
    driver.find_elements_by_class_name('content')[2].click()
    time.sleep(2)
    s1 = driver.find_element_by_class_name('i1').text
    s2 = s1[3:6]
    s3 = driver.find_element_by_id('pre-kanjiang').text
    s4 = driver.find_element_by_xpath('//*[@id="money"]/strong').text
    s5 = driver.find_element_by_xpath('//*[@id="money"]/em').text
    print('key=', s2, 'time=', s3, s5 + '=', s4)
    fs.write('key=' + s2 + '\n' + 'time=' + s3 + '\n' + s5 + '=' + s4 + '\n')
    time.sleep(2)
    if int(s2) == int(s.get('key')):
        elements = driver.find_elements_by_class_name('code')
        if 'A' in s.keys():
            data_values = s.get('A')
            for i in data_values:
                a_button_index = int(i) - 1
                elements[a_button_index].click()
                print('a_button_index = ', a_button_index)
                fs.write('a_button_index = ' + str(a_button_index) + '\n')
        if 'B' in s.keys():
            data_values = s.get('B')
            for j in data_values:
                b_button_index = int(j) + 9
                elements[b_button_index].click()
                print('b_button_index = ', b_button_index)
                fs.write('b_button_index = ' + str(b_button_index) + '\n')
        if 'C' in s.keys():
            data_values = s.get('C')
            for k in data_values:
                c_button_index = int(k) + 19
                elements[c_button_index].click()
                print('c_button_index = ', c_button_index)
                fs.write('c_button_index = ' + str(c_button_index) + '\n')
        time.sleep(1)
        driver.find_elements_by_name('danwei')[1].click()
        driver.find_element_by_class_name('txt').clear()
        driver.find_element_by_class_name('txt').send_keys(int(s.get('T')) * 1)
        driver.find_element_by_class_name('tztj-hover').click()
        time.sleep(2)
        driver.find_element_by_class_name('tz-true-hover').click()
        time.sleep(2)
        driver.find_element_by_xpath("/html/body/div[2]/div[3]/div/button[1]").send_keys(Keys.ENTER)
    time.sleep(2)
    driver.quit()
except Exception as e:
    print(e)
Automated testing of a desktop (C/S) client
import os, sys, time
import pywinauto
import pywinauto.clipboard
import pywinauto.application
import win32clipboard as wincb
import win32con

def winmax():   # maximize the target client window (the snippet is cut off here)
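A minimal hedged sketch of what such a helper could look like with the pywinauto imports above; the backend choice and the window-title pattern are assumptions, not the original author's code:

def winmax(title='客户端'):
    # connect to an already-running client whose window title starts with `title` (placeholder value)
    app = pywinauto.application.Application(backend='win32').connect(title_re=title + '.*')
    win = app.top_window()   # main window of the connected application
    win.set_focus()          # bring it to the foreground
    win.maximize()           # maximize it
    return win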