import os
def find_file(work_dir, extension='jpg'):
    """Return the names of entries in *work_dir* that have the given extension.

    Only *work_dir* itself is listed (via ``os.listdir``); subdirectories are
    not descended into.  The match is case-sensitive and is made against the
    final suffix reported by ``os.path.splitext``.

    Args:
        work_dir: Directory whose entries are examined.
        extension: Extension to match, written without the leading dot.

    Returns:
        A list of matching entry names (not full paths), in the order
        ``os.listdir`` produced them.

    Raises:
        OSError: Propagated from ``os.listdir`` (for example when
            *work_dir* does not exist).
    """
    # Build the suffix once instead of on every iteration.
    suffix = '.' + extension
    return [
        filename
        for filename in os.listdir(work_dir)
        if os.path.splitext(filename)[1] == suffix
    ]
# find_file('.', 'md')  # returns the names of the .md files directly inside the current directory (not recursive)
七、正则和爬虫
1. 爬取天气数据并解析温度值
演示 requests、lxml 与 pandas 的结合使用。
import requests
from lxml import etree
import pandas as pd
import re
url = 'http://www.weather.com.cn/weather1d/101010100.shtml'


def parse_temp(temp_str):
    """Split a ``"<high>/<low>°C"`` string into two integers.

    The pattern is anchored at the start of the string only, so trailing
    text after ``°C`` is ignored.

    Args:
        temp_str: Text such as ``"25/15°C"`` or ``"-3/-10°C"``.

    Returns:
        ``(high, low)`` as ints, or ``(None, None)`` when *temp_str* is not
        a string or does not start with the expected pattern.
    """
    if not isinstance(temp_str, str):
        return None, None
    # One pass captures both numbers; requiring at least one digit means
    # int() can never be handed an empty string or a lone minus sign.
    match = re.match(r'(-?[0-9]+)/(-?[0-9]+)°C', temp_str)
    if match is None:
        return None, None
    return int(match.group(1)), int(match.group(2))


# Script boundary: any failure (network, HTTP status, parsing, frame
# construction) is reported on stdout instead of raising.
try:
    with requests.get(url, timeout=10) as res:
        # Treat 4xx/5xx responses as failures rather than parsing an error page.
        res.raise_for_status()
        content = res.content
    html = etree.HTML(content)
    location = html.xpath('//*[@id="around"]//a[@target="_blank"]/span/text()')
    temperature = html.xpath('//*[@id="around"]/div/ul/li/a/i/text()')
    df = pd.DataFrame({'location': location, 'temperature': temperature})
    temps = df['temperature'].apply(parse_temp)
    df['high'] = [t[0] for t in temps]
    df['low'] = [t[1] for t in temps]
    print(df.head())
except Exception as e:
    print(f"请求失败:{e}")
2. 批量转化驼峰格式
字符串命名规范转换工具。
import re
def camel(s):
    """Convert a whitespace-, underscore- or hyphen-separated name to camelCase.

    Runs of separators are collapsed, each word is title-cased with
    ``str.title`` (so letters after the first in a word are lower-cased),
    the words are joined, and the first character is lower-cased.

    Args:
        s: The name to convert, e.g. ``"student_id"``.

    Returns:
        The camelCase form, e.g. ``"studentId"``.  An empty string is
        returned when *s* is empty or consists only of separators.
    """
    joined = re.sub(r"(\s|_|-)+", " ", s).title().replace(" ", "")
    # Guard the indexing below: nothing is left for empty/separator-only input.
    if not joined:
        return joined
    return joined[0].lower() + joined[1:]
def batch_camel(slist):
    """Apply ``camel`` to every string in *slist*.

    Args:
        slist: Iterable of names to convert.

    Returns:
        A new list with the converted names, in input order.
    """
    return [camel(s) for s in slist]


print(batch_camel(['student_id', 'student\tname', 'student-add']))  # ['studentId', 'studentName', 'studentAdd']
import turtle
import random
def draw_snowflakes(count=100):
    """Draw *count* small white dots at random positions, then run the turtle main loop.

    Each dot has diameter 3 and is placed at integer coordinates drawn
    uniformly from [-300, 300] on both axes.  The call ends with
    ``turtle.done()``.

    Args:
        count: Number of dots to draw.
    """
    # NOTE(review): the dots are white; confirm the screen background is set
    # to a darker colour elsewhere, otherwise they may not be visible.
    t = turtle.Turtle()
    t.speed(0)  # 0 = no animation delay
    t.penup()  # never draw connecting lines; the pen stays up for every move
    for _ in range(count):
        x = random.randint(-300, 300)
        y = random.randint(-300, 300)
        t.goto(x, y)
        t.dot(3, 'white')
    turtle.done()
3. 词频云图
使用 wordcloud 库可视化文本数据。
import hashlib
import pandas as pd
from wordcloud import WordCloud
# Assume geo_data has already been loaded:
# geo_data = pd.read_excel(r"../data/geo_data.xlsx")
# words = ','.join(x for x in geo_data['city'] if x != [])
# wc = WordCloud(
#     background_color="green",
#     max_words=100,
#     font_path='./fonts/simhei.ttf',
#     width=500
# )
# x = wc.generate(words)
# x.to_file('../data/geo_data.png')
九、生成器
1. 求斐波那契数列前 n 项 (生成器版)
使用 yield 关键字节省内存。
def fibonacci_gen(n):
    """Lazily yield the first *n* Fibonacci numbers, starting 1, 1, 2, ...

    Args:
        n: How many terms to produce; nothing is yielded when ``n <= 0``.

    Yields:
        The next Fibonacci number.
    """
    a, b = 1, 1
    for _ in range(n):
        yield a
        a, b = b, a + b


print(list(fibonacci_gen(5)))  # [1, 1, 2, 3, 5]
2. 将 list 等分为子组 (生成器版)
惰性计算,适合大数据流处理。
from math import ceil
def divide_iter(lst, n):
    """Lazily split *lst* into *n* consecutive slices of equal nominal size.

    The slice length is ``ceil(len(lst) / n)``.  Exactly *n* slices are
    always yielded, so trailing slices may be shorter or empty when
    ``len(lst)`` is not a multiple of that length.

    Args:
        lst: Sequence supporting ``len`` and slicing.
        n: Number of groups.  When ``n <= 0`` the whole of *lst* is yielded
            once as a single group.

    Yields:
        Successive slices of *lst*.
    """
    if n <= 0:
        yield lst
        return
    i, div = 0, ceil(len(lst) / n)
    while i < n:
        yield lst[i * div:(i + 1) * div]
        i += 1


print(list(divide_iter([1, 2, 3, 4, 5], 2)))  # [[1, 2, 3], [4, 5]]
十、Keras 入门
1. Keras 入门例子
构建简单的神经网络模型。
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
# Build a synthetic data set: 1000 rows of 100 random features, each row
# paired with a random 0/1 label.  The NumPy seed is fixed first so the
# generated arrays are the same on every run.
np.random.seed(42)
data = np.random.random((1000, 100))
labels = np.random.randint(2, size=(1000, 1))

# Small fully connected binary classifier: one 32-unit ReLU hidden layer
# followed by a single sigmoid output unit.
model = Sequential([
    Dense(32, activation='relu', input_dim=100),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

# Train silently (verbose=0), then predict on the same data the model was fitted on.
model.fit(data, labels, batch_size=32, epochs=10, verbose=0)
predictions = model.predict(data)
print(f"预测样本数:{len(predictions)}")