def filter_device_rows(lines, device="earpa001"):
    """Yield the rows of a sensor log that belong to *device*.

    Each input line is split on ","; a row is kept when its second field
    contains *device*.  Only the first four fields are emitted, joined by
    "," and terminated with a newline.  Rows with fewer than four fields
    (for example blank lines) are skipped instead of raising IndexError.
    """
    for line in lines:
        fields = line.strip("\n").split(",")
        if len(fields) >= 4 and device in fields[1]:
            yield ",".join(fields[:4]) + "\n"


if __name__ == "__main__":
    # Context managers close both files even if a row fails to process.
    with open("sensor.txt", "r", encoding="utf-8") as fi, \
            open("earpa001.txt", "w") as fo:
        fo.writelines(filter_device_rows(fi))
第 2 小问:设备型号统计与排序
任务要求:统计设备型号组合出现次数,按频次降序输出。
def count_models(lines):
    """Count how often each "<field 3>-<field 4>" combination occurs.

    Returns a list of (combination, count) pairs sorted by descending
    count; combinations with equal counts keep first-seen order.  Rows with
    fewer than four comma-separated fields are ignored.
    """
    counts = {}
    for line in lines:
        fields = line.strip("\n").split(",")
        if len(fields) < 4:
            continue
        key = fields[2] + "-" + fields[3]
        counts[key] = counts.get(key, 0) + 1
    return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)


if __name__ == "__main__":
    # The listing had blanks in the output loop; they are filled here with
    # the answer fragments given for them (len, '{},{}\n', format, 0, 1).
    with open("earpa001.txt", "r") as fi, \
            open("earpa001_count.txt", "w") as fo:
        for model, times in count_models(fi):
            fo.write('{},{}\n'.format(model, times))
第 2 套题
第 1 小问:《论语》原文提取
任务要求:从文件中提取标记为【原文】的内容。
def extract_original_text(lines):
    """Yield the non-empty lines found after a 【原文】 marker.

    Emission starts on the line after one containing "【原文】" and stops at
    a line containing "【注释】".  Marker lines themselves are never emitted.
    Each emitted line is stripped of surrounding spaces/newlines and given
    a single trailing newline.
    """
    keep = False
    for line in lines:
        if "【原文】" in line:
            keep = True
            continue
        if "【注释】" in line:
            keep = False
        text = line.strip(" \n")
        if keep and text:
            yield text + "\n"


if __name__ == "__main__":
    with open("论语.txt", "r") as fi, open("论语 - 原文.txt", "w") as fo:
        fo.writelines(extract_original_text(fi))
第 2 小问:去除数字括号
任务要求:清理文本中的 (1) 到 (30) 格式字符。
def strip_note_marks(line, first=1, last=30):
    """Return *line* with every "(n)" marker for first <= n <= last removed.

    The task statement asks for (1) through (30).  The original loop used
    range(0, 30), which left "(30)" in place and removed "(0)" instead.
    """
    for number in range(first, last + 1):
        line = line.replace('({})'.format(number), '')
    return line


if __name__ == "__main__":
    with open("论语 - 原文.txt", "r") as fi, \
            open("论语 - 提纯原文.txt", "w") as fo:
        for line in fi:
            fo.write(strip_note_marks(line))
第 3 套题
第 1 小问:星座日期查询
任务要求:根据输入的星座名称查找生日区间。
def describe_sign(lines, name):
    """Return one message per CSV line that contains *name*.

    Each message is built from fields 2-4 of the matching line.  An empty
    *name* returns no messages: as a substring it would match every line,
    including the blank one after the final newline, which made the
    original raise IndexError.  Matching lines with fewer than four fields
    are skipped.
    """
    if not name:
        return []
    messages = []
    for line in lines:
        if name in line:
            fields = line.split(",")
            if len(fields) >= 4:
                messages.append(
                    "{}的生日位于{}-{}之间".format(fields[1], fields[2], fields[3])
                )
    return messages


if __name__ == "__main__":
    with open("PY301-SunSign.csv") as f:
        rows = f.read().split("\n")
    name = input("请输入星座中文名称")
    for message in describe_sign(rows, name):
        print(message)
第 2 小问:序号匹配与日期格式化
任务要求:支持输入序号,处理月份日期的不同长度格式。
def _split_mmdd(mmdd):
    """Split a compact month/day string: "120" -> ("1", "20"), "1024" -> ("10", "24")."""
    cut = 1 if len(mmdd) == 3 else 2
    return mmdd[:cut], mmdd[cut:cut + 2]


def sign_birthdays(rows, query):
    """Return a birthday message for every index in *query* found in *rows*.

    *rows* are the CSV rows already split into fields; *query* is a
    whitespace-separated list of indices compared as text with field 1.
    Matching rows with fewer than five fields are skipped.
    """
    messages = []
    for token in query.split():
        for row in rows:
            if row[0] == token and len(row) >= 5:
                m1, d1 = _split_mmdd(row[2])
                m2, d2 = _split_mmdd(row[3])
                messages.append(
                    "{}({})的生日是{}月{}日至{}月{}日之间".format(
                        row[1], row[4], m1, d1, m2, d2
                    )
                )
    return messages


if __name__ == "__main__":
    # NOTE(review): the first sub-question opens "PY301-SunSign.csv"; on a
    # case-sensitive filesystem only one of the two spellings can exist.
    with open("py301-sunsign.csv", "r") as f:
        table = [line.strip('\n').split(',') for line in f]
    x = input("请输入星座序号(例如,5):")
    for message in sign_birthdays(table, x):
        print(message)
第 3 小问:增强版序号校验
任务要求:增加输入合法性检查,防止越界访问。
def _month_day(mmdd):
    """Split a compact month/day string: "119" -> ("1", "19"), "1222" -> ("12", "22")."""
    cut = 1 if len(mmdd) == 3 else 2
    return mmdd[:cut], mmdd[cut:cut + 2]


def checked_sign_birthdays(rows, query):
    """Return one message per index in *query*, validating each index.

    An index is valid when it is an integer with 0 < index < len(rows).
    Invalid or non-numeric tokens produce the error message rather than an
    exception.  The end date is split according to its own length; the
    original keyed it on the start date's length, so a row such as
    1222 -> 119 was rendered as "11月9日".
    """
    messages = []
    for token in query.strip(' \n').split():
        try:
            number = int(token)
        except ValueError:
            number = 0  # reported through the same "invalid index" path
        if not 0 < number < len(rows):
            messages.append("输入星座序号有误!")
            continue
        for row in rows:
            if row[0] == token:
                m1, d1 = _month_day(row[2])
                m2, d2 = _month_day(row[3])
                messages.append(
                    "{}({})的生日是{}月{}日至{}月{}日之间".format(
                        row[1], row[4], m1, d1, m2, d2
                    )
                )
    return messages


if __name__ == "__main__":
    # NOTE(review): the first sub-question opens "PY301-SunSign.csv"; on a
    # case-sensitive filesystem only one of the two spellings can exist.
    with open("py301-sunsign.csv", 'r') as f:
        table = [line.strip(' \n').split(',') for line in f]
    x = input("请输入星座序号(例如,5):")
    for message in checked_sign_birthdays(table, x):
        print(message)
第 4 套题
第 1 小问:字符频率统计
任务要求:统计文本中非标点符号字符的出现次数。
# Characters ignored when counting.  The first literal is the set exactly
# as it appears in the listing; the second adds the full-width / curly
# forms.  NOTE(review): the listing looks as if full-width punctuation was
# normalised to ASCII during extraction - confirm against the data file.
Q4_1_SKIPPED = ",。?!《》【】''''" + ",?!“”‘’"


def most_common_char(text, skipped=Q4_1_SKIPPED):
    """Return (character, count) for the most frequent character in *text*.

    Characters in *skipped* are ignored.  Ties go to the character seen
    first.  Returns None when nothing is left to count (the original raised
    IndexError in that case).
    """
    counts = {}
    for ch in text:
        if ch not in skipped:
            counts[ch] = counts.get(ch, 0) + 1
    if not counts:
        return None
    return max(counts.items(), key=lambda pair: pair[1])


if __name__ == "__main__":
    with open('命运.txt', 'r') as f:
        best = most_common_char(f.read())
    if best is not None:
        print("{}:{}".format(best[0], best[1]))
第 2 小问:高频字符输出
任务要求:输出出现频率最高的 10 个字符。
# First literal: the skip set exactly as listed.  Second literal: the
# full-width / curly forms.  NOTE(review): the listing appears to have had
# full-width punctuation normalised to ASCII - confirm against the data.
Q4_2_SKIPPED = ",:。?!《》【】''''\"\n" + ",:?!“”‘’"


def top_chars(text, k=10, skipped=Q4_2_SKIPPED):
    """Return the *k* most frequent characters of *text* as one string.

    Characters in *skipped* are ignored; ties keep first-seen order.  When
    fewer than *k* distinct characters exist, all of them are returned (the
    original indexed ten entries unconditionally and raised IndexError).
    """
    counts = {}
    for ch in text:
        if ch not in skipped:
            counts[ch] = counts.get(ch, 0) + 1
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return "".join(ch for ch, _ in ranked[:k])


if __name__ == "__main__":
    with open('命运.txt', 'r') as f:
        print(top_chars(f.read()), end="")
第 3 小问:频次排序写入文件
任务要求:将统计结果写入新文件,去除换行符干扰。
def format_char_frequencies(text):
    """Return "char:count" pairs for *text*, most frequent first, joined by ",".

    Newlines are not counted.  Ties keep first-seen order.  An empty text
    gives an empty string.
    """
    counts = {}
    for ch in text:
        if ch != "\n":
            counts[ch] = counts.get(ch, 0) + 1
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    # join() replaces the append-a-comma-then-slice pattern of the original.
    return ",".join("{}:{}".format(ch, n) for ch, n in ranked)


if __name__ == "__main__":
    with open('命运.txt', 'r') as f:
        report = format_char_frequencies(f.read())
    with open('命运 - 频次排序.txt', 'w') as fi:
        fi.write(report)
第 5 套题
第 1 小问:文本清洗与分词
任务要求:使用 jieba 进行分词并保存。
import jieba
def segmented_text(lines, cut):
    """Return the text written to out.txt for *lines*.

    Each line has its leading/trailing spaces removed (its newline is kept),
    is split into tokens by *cut*, and the tokens are joined with "\\n".
    The per-line results are concatenated without any extra separator,
    which is byte-for-byte what the original loop wrote.
    """
    return "".join("\n".join(cut(line.strip(' '))) for line in lines)


if __name__ == "__main__":
    with open('data.txt', 'r') as f:
        lines = f.readlines()
    with open('out.txt', 'w') as f:
        # write(), not writelines(): the payload is a single string.
        f.write(segmented_text(lines, jieba.lcut))
第 2 小问:特定词频统计
任务要求:统计指定词语(如'曹操')的出现次数。
import jieba
def count_word(lines, target):
    """Return how many of *lines* equal *target* once the newline is removed.

    The original sliced off the last character of every line, which cut a
    real character from a final line without a newline, and it raised
    KeyError when *target* never occurred.
    """
    counts = {}
    for line in lines:
        word = line.rstrip("\n")
        if word:
            counts[word] = counts.get(word, 0) + 1
    return counts.get(target, 0)


if __name__ == "__main__":
    with open('out.txt', 'r') as f:
        words = f.readlines()
    print("曹操出现次数为:{} ".format(count_word(words, "曹操")))
第 6 套题
第 1 小问:去重长词提取
任务要求:提取长度大于等于 3 且未重复的词。
import jieba
def unique_long_words(tokens, min_len=3):
    """Return the distinct tokens of at least *min_len* characters, in first-seen order."""
    seen = set()  # O(1) membership instead of scanning the result list
    result = []
    for token in tokens:
        if len(token) >= min_len and token not in seen:
            seen.add(token)
            result.append(token)
    return result


if __name__ == "__main__":
    with open('data.txt', 'r') as fi:
        tokens = [
            word
            for line in fi
            for word in jieba.lcut(line.strip('\n'))
        ]
    with open('out1.txt', 'w') as f:
        f.writelines(word + '\n' for word in unique_long_words(tokens))
第 2 小问:词频排序输出
任务要求:统计词频并排序输出。
import jieba
def long_word_frequencies(tokens, min_len=3):
    """Return (word, count) pairs for tokens of at least *min_len* characters.

    Pairs are sorted by descending count; ties keep first-seen order.
    """
    counts = {}
    for token in tokens:
        if len(token) >= min_len:
            counts[token] = counts.get(token, 0) + 1
    return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)


if __name__ == "__main__":
    with open('data.txt', 'r') as fi:
        tokens = [
            word
            for line in fi
            for word in jieba.lcut(line.strip('\n'))
        ]
    # The listing overwrote its accumulator (`s = ...`), so with the lost
    # indentation it either wrote only the last entry or never needed `s`.
    # One "word:count" line is written per entry.
    with open('out2.txt', 'w') as fo:
        fo.writelines(
            '{}:{}\n'.format(word, n)
            for word, n in long_word_frequencies(tokens)
        )
第 7 套题
第 1 小问:成绩计算与排名
任务要求:计算总分并按降序排列,输出前 10 名。
def top_candidates(lines, k=10):
    """Return the *k* records with the highest total score, best first.

    Each line is whitespace-separated; every field from the third onward is
    a score.  Records are returned as the original fields joined by single
    spaces.  Scores are parsed with float() rather than eval(), so file
    content is never executed.  Blank lines are skipped; the original
    ranked them with a total of 0 and could write them as empty lines.
    """
    scored = []
    for line in lines:
        fields = line.strip("\n").split()
        if not fields:
            continue
        scored.append((sum(float(mark) for mark in fields[2:]), fields))
    scored.sort(key=lambda item: item[0], reverse=True)
    return [" ".join(fields) for _, fields in scored[:k]]


if __name__ == "__main__":
    with open("score301.txt") as fi:
        best = top_candidates(fi)
    with open("cand301.txt", "w") as fo:
        fo.writelines(record + "\n" for record in best)
第 2 小问:单科及格筛选
任务要求:筛选所有科目均及格的学生。
# Input file: cand301.txt    Output file: best301.txt


def passed_all_subjects(lines, pass_mark=60):
    """Return the first two fields of every record whose lowest score is >= *pass_mark*.

    Every field from the third onward is a score.  Records without any
    score (including blank lines) are skipped; the original raised
    ValueError on them.  Scores are parsed with float() rather than eval().
    """
    passed = []
    for line in lines:
        fields = line.strip("\n").split()
        if len(fields) < 3:
            continue
        if min(float(mark) for mark in fields[2:]) >= pass_mark:
            passed.append(" ".join(fields[:2]))
    return passed


if __name__ == "__main__":
    with open("cand301.txt", "r") as fi, open("best301.txt", "w") as fo:
        fo.writelines(entry + "\n" for entry in passed_all_subjects(fi))
第 8 套题
第 1 小问:HTML 属性提取
任务要求:从 HTML 中提取 alt 属性值。
def extract_alt_values(lines):
    """Return the value of the first alt="..." attribute on each line that has one.

    The value starts five characters after "alt=" (skipping `alt="`) and
    ends before the next double quote.  When no closing quote follows, the
    line's last character is dropped, as in the original.
    """
    values = []
    for line in lines:
        if "alt=" not in line:
            continue
        start = line.find('alt=') + 5
        # find() yields -1 when no quote follows, so the slice then ends
        # just before the last character - the same result the original
        # got from its explicit end bound of -1.
        end = line.find('"', start)
        values.append(line[start:end])
    return values


if __name__ == "__main__":
    with open('data.txt', 'r') as fo:
        lines = fo.read().split('\n')
    with open("univ.txt", "w") as f:
        f.writelines(school + '\n' for school in extract_alt_values(lines))
第 2 小问:大学学院名称统计
任务要求:区分统计包含'大学'和'学院'的名称数量。
def tally_schools(names):
    """Classify school names.

    Returns (matched, universities, colleges).  A name matches when it
    contains "大学" or "学院" and does not contain "大学生".  A matched name
    containing "大学" counts as a university; otherwise it counts as a
    college.
    """
    matched = []
    universities = 0
    colleges = 0
    for name in names:
        is_university = "大学" in name
        is_college = "学院" in name
        if not (is_university or is_college) or "大学生" in name:
            continue
        matched.append(name)
        if is_university:
            universities += 1
        else:
            colleges += 1
    return matched, universities, colleges


if __name__ == "__main__":
    with open("univ.txt", "r") as f:
        lines = f.read().split('\n')
    schools, n, k = tally_schools(lines)
    for school in schools:
        print(school)
    print("包含大学的名称数量是{}".format(n))
    print("包含学院的名称数量是{}".format(k))
第 9 套题
第 1 小问:两年词频对比
任务要求:分别统计 2018 和 2019 年词频前 10。
import jieba
def top_word_summary(tokens, k=10, min_len=2):
    """Return "word:count" for the *k* most frequent tokens, joined by ",".

    Only tokens of at least *min_len* characters are counted; ties keep
    first-seen order.  With fewer than *k* distinct words the summary is
    simply shorter (the original indexed ten entries and raised IndexError).
    """
    counts = {}
    for token in tokens:
        if len(token) >= min_len:
            counts[token] = counts.get(token, 0) + 1
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return ",".join('{}:{}'.format(word, n) for word, n in ranked[:k])


if __name__ == "__main__":
    # Both files are read before anything is printed, as in the original.
    with open('data2018.txt', 'r') as f2018:
        line2018 = f2018.read().split('\n')
    with open('data2019.txt', 'r') as f2019:
        line2019 = f2019.read().split('\n')
    for label, lines in (('2018:', line2018), ('2019:', line2019)):
        tokens = [word for line in lines for word in jieba.lcut(line)]
        print(label + top_word_summary(tokens))
第 2 小问:特有词分析
任务要求:找出两年共有的词以及各自特有的词。
import jieba
def top_word_list(tokens, k=10, min_len=2):
    """Return the *k* most frequent tokens of at least *min_len* characters.

    Most frequent first; ties keep first-seen order.
    """
    counts = {}
    for token in tokens:
        if len(token) >= min_len:
            counts[token] = counts.get(token, 0) + 1
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return [word for word, _ in ranked[:k]]


def split_common(first, second):
    """Return (common, only_first, only_second) for two word lists.

    *common* follows the order of *first*; each "only" list keeps the order
    of its source list.
    """
    in_first = set(first)
    in_second = set(second)
    common = [word for word in first if word in in_second]
    only_first = [word for word in first if word not in in_second]
    only_second = [word for word in second if word not in in_first]
    return common, only_first, only_second


if __name__ == "__main__":
    with open('data2018.txt', 'r') as f2018:
        line2018 = f2018.read().split('\n')
    with open('data2019.txt', 'r') as f2019:
        line2019 = f2019.read().split('\n')
    ls2018 = top_word_list(w for line in line2018 for w in jieba.lcut(line))
    ls2019 = top_word_list(w for line in line2019 for w in jieba.lcut(line))
    lslike, only2018, only2019 = split_common(ls2018, ls2019)
    # join() copes with empty lists; the original indexed [-1] and raised
    # IndexError when a list was empty.
    print('共有词语:' + ','.join(lslike))
    print('2019 特有:' + ','.join(only2019))
    print('2018 特有:' + ','.join(only2018))
第 10 套题
第 1 小问:特殊字符清洗
任务要求:移除指定的标点符号。
import jieba
# First literal: the removal set exactly as it appears in the listing (two
# adjacent string literals).  Second literal: the full-width / curly forms.
# NOTE(review): the listing looks as if full-width punctuation was
# normalised to ASCII during extraction - confirm against data.txt.
Q10_1_REMOVED = ',。?、''"";:、)\n(!' + ',?“”‘’;:)(!'


def remove_punctuation(text, removed=Q10_1_REMOVED):
    """Return *text* without any character that appears in *removed*."""
    # One translate() pass instead of building the result with repeated +=.
    return text.translate(str.maketrans('', '', removed))


if __name__ == "__main__":
    with open('data.txt', 'r') as f:
        data = f.read()
    with open('clean.txt', 'w') as f:
        f.write(remove_punctuation(data))
第 2 小问:长词频统计
任务要求:统计长度大于等于 3 的词频。
import jieba
def top_long_word_summary(tokens, k=10, min_len=3):
    """Return "word:count" for the *k* most frequent long tokens, joined by ",".

    Only tokens of at least *min_len* characters are counted; ties keep
    first-seen order.  Fewer than *k* distinct words gives a shorter
    summary (the original indexed the tenth entry and raised IndexError).
    """
    counts = {}
    for token in tokens:
        if len(token) >= min_len:
            counts[token] = counts.get(token, 0) + 1
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return ",".join('{}:{}'.format(word, n) for word, n in ranked[:k])


if __name__ == "__main__":
    with open('clean.txt', 'r') as f:
        data = f.read()
    print(top_long_word_summary(jieba.lcut(data)))
第 11 套题
例题:红楼梦文本分析
任务要求:统计标点、唯一词数及高频词。
import jieba
# First literal: the marks exactly as listed.  Second literal: their
# full-width forms.  NOTE(review): the listing appears to have had
# full-width punctuation normalised to ASCII; with the ASCII set alone only
# "。" would be counted in ordinary Chinese text - confirm against the data.
Q11_MARKS = ",。:;?" + ",:;?"


def count_marks(text, marks=Q11_MARKS):
    """Return how many characters of *text* are in *marks*."""
    return sum(1 for ch in text if ch in marks)


def rank_words(tokens, min_len=2):
    """Return (word, count) pairs for tokens of at least *min_len* characters.

    Sorted by descending count; ties keep first-seen order.
    """
    counts = {}
    for token in tokens:
        if len(token) >= min_len:
            counts[token] = counts.get(token, 0) + 1
    return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)


if __name__ == "__main__":
    with open("红楼梦.txt", encoding='utf-8') as fp:
        ss = fp.read()
    print(count_marks(ss))
    ranked = rank_words(jieba.lcut(ss))
    # One pair per distinct word, so this is the number of unique words.
    print(len(ranked))
    for word, n in ranked[0:2]:
        print("{},{}".format(word, n))
第 12 套题
例题:八十天环游地球
任务要求:统计非空行、字符数、词语数及章节提取。
import jieba
# First literal: the characters to delete exactly as listed.  Second
# literal: full-width / curly forms.  NOTE(review): the listing appears to
# have had full-width punctuation normalised to ASCII - confirm.
Q12_DELETED = ' "?!:,。' + '?!:,“”'


def clean_nonblank_lines(lines, deleted=Q12_DELETED):
    """Return every line other than a bare "\\n", with *deleted* characters removed.

    The trailing newline of each kept line is preserved.
    """
    cleaned = []
    for line in lines:
        if line != "\n":
            cleaned.append("".join(ch for ch in line if ch not in deleted))
    return cleaned


if __name__ == "__main__":
    with open("八十天环游地球.txt", "r") as fs:  # was never closed
        lss = fs.readlines()

    new_list = clean_nonblank_lines(lss)
    print("共{}个非空行".format(len(new_list)))

    # The listing built the cleaned lines and then measured the raw lines,
    # leaving the cleaned list unused; "remaining characters" is taken to
    # mean the cleaned text.  NOTE(review): confirm against the task sheet.
    alens = sum(len(line) for line in new_list)
    wlens = sum(len(jieba.lcut(line)) for line in new_list)
    print("剩余字符数{}, 词语数{}".format(alens, wlens))

    # Chapter titles are matched on the raw lines: the pattern needs the
    # space that cleaning removes.
    with open("八十天环游地球 - 章节.txt", "w") as fo:
        fo.writelines(line for line in lss if "章 " in line)
第 13 套题
例题:学生成绩处理
任务要求:统计人数、计算平均分、输出分数信息。
def parse_scores(lines):
    """Return {name: score_text} parsed from "name:<field>,<score>" records.

    The name is the text before the first ":"; the score is the second
    ","-separated field of the text after it.  Lines that do not have that
    shape (for example blank lines) are skipped; the original raised
    IndexError on them.
    NOTE(review): the separators are ASCII in the listing; if data301.txt
    uses full-width ":" / "," they must be changed - confirm.
    """
    scores = {}
    for line in lines:
        record = line.replace('\n', '').split(':')
        if len(record) < 2:
            continue
        parts = record[1].split(',')
        if len(parts) < 2:
            continue
        scores[record[0]] = parts[1]
    return scores


def average_score(scores):
    """Return the mean of the integer scores in *scores*, or 0.0 when it is empty."""
    if not scores:
        return 0.0
    return sum(int(score) for score in scores.values()) / len(scores)


if __name__ == "__main__":
    # Question 1: count the students in the source file.
    with open('data301.txt', 'rt') as f1:
        stu_lst = [line for line in f1 if line.strip()]
    print("素材文件中学生的人数是{}".format(len(stu_lst)))

    # Question 2: average score over all students.
    stu_dic = parse_scores(stu_lst)
    print("所有学生的平均分是{:.1f}".format(average_score(stu_dic)))

    # Question 3: write "name,score" lines to the result file.
    with open('result301.txt', 'wt') as f2:
        f2.writelines(
            name + "," + score + "\n" for name, score in stu_dic.items()
        )
第 14 套题
例题:图片链接提取
任务要求:统计图片数量并提取完整 URL。
def extract_jpg_urls(text):
    """Return the "http://....JPG" URL found on each line of *text*.

    For every line, the URL runs from the first "http://" to the first
    ".JPG" after it.  Lines missing either part are skipped.  The original
    sliced with the raw find() results, so a missing "http://" (-1) or a
    ".JPG" located before the URL produced empty or garbage output.
    """
    urls = []
    for line in text.split("\n"):
        start = line.find("http://")
        if start == -1:
            continue
        end = line.find(".JPG", start)
        if end == -1:
            continue
        urls.append(line[start:end + len(".JPG")])
    return urls


if __name__ == "__main__":
    with open("data301.txt", "r") as fi:
        ss = fi.read()
    print(ss.count("<a"))
    # Number of lines that mention a .JPG file, as in the original.
    print(sum(1 for line in ss.split("\n") if ".JPG" in line))
    with open("images.txt", "w") as fp:
        fp.writelines(url + "\n" for url in extract_jpg_urls(ss))
第 15 套题
例题:停用词过滤与句子定位
任务要求:排除停用词,统计词频,定位最大词所在句子。
stop_word = ['我们','同时','之后','更好','这些','进行']


def count_candidate_words(tokens, excluded=stop_word):
    """Return {word: count} for tokens longer than one character and not in *excluded*."""
    counts = {}
    for token in tokens:
        if len(token) > 1 and token not in excluded:
            counts[token] = counts.get(token, 0) + 1
    return counts


def sentences_containing(text, word):
    """Return the "。"-delimited sentences of *text* that contain *word*."""
    return [sentence for sentence in text.split('。') if word in sentence]


if __name__ == "__main__":
    # In the listing this import was fused onto the end of a comment line
    # and therefore never executed.
    import jieba

    # Question 1: read the file and report its character count.
    with open('data301.txt', 'rt') as f1:
        txt = f1.read()
    print("素材文件字符个数是{}。".format(len(txt)))

    # Question 2: count distinct words longer than one character,
    # excluding the stop words, and find the most frequent one.
    word_count = count_candidate_words(jieba.lcut(txt))
    print("长度大于 1 且不相同的词的个数是{}。".format(len(word_count)))
    # max() returns the first word reaching the highest count.  With no
    # candidate word at all the original raised NameError.
    topword = max(word_count, key=word_count.get) if word_count else ""
    print("词频最大的词是:{}".format(topword))

    # Question 3: write every sentence that contains that word.  An empty
    # word would match every sentence, so it is treated as "no match".
    matches = sentences_containing(txt, topword) if topword else []
    with open("out301.txt", "w") as fo:
        fo.writelines(sentence + "\n" for sentence in matches)
第 16 套题
例题:疫情数据统计
任务要求:解析 JSON 风格文本,统计确诊人数及分布。
def parse_confirmed(lines):
    """Parse '"name": ...' / '"value": ...' line pairs.

    Returns (totals, entries): *totals* maps each name to the integer from
    the first '"value":' line that follows it, and *entries* is the number
    of values recorded.  Further '"value":' lines before the next name are
    ignored.  A trailing comma after the number is accepted; the original
    raised ValueError on it.
    """
    totals = {}
    entries = 0
    name = ''
    awaiting_value = True
    for line in lines:
        if '"name":' in line:
            name = line.split(':')[1].strip(' ,"\n')
            awaiting_value = True
        elif '"value":' in line and awaiting_value:
            totals[name] = int(line.split(':')[1].strip(' ,\n'))
            awaiting_value = False
            entries += 1
    return totals, entries


if __name__ == "__main__":
    with open("data301.txt", "r") as fi:
        dc, count = parse_confirmed(fi)
    print("一共有{}个国家".format(count))
    if dc:  # the original indexed the first entry unconditionally
        worst = max(dc.items(), key=lambda pair: pair[1])
        print("确诊人数最多的国家是{},人数是{}".format(worst[0], worst[1]))
    lw = sum(1 for cases in dc.values() if cases >= 10000)
    lz = sum(1 for cases in dc.values() if cases == 0)
    print("确诊人数超过 1W 的国家有{}个".format(lw))
    print("确诊人数为 0 的国家有{}个".format(lz))
第 17 套题
例题:CSV 数据分析
任务要求:读取 CSV,计算疑似重症治愈的总数与最大值,筛选高于平均值的地区。
def total(lst):
    """Return the sum of the numbers in *lst* (0 for an empty list)."""
    return sum(lst)


def _to_number(text):
    """Parse *text* as an int when possible, otherwise as a float (no eval)."""
    try:
        return int(text)
    except ValueError:
        return float(text)


def parse_regions(lines):
    """Return (region, suspected, severe, cured) tuples from CSV *lines*.

    The first line is treated as the header.  Lines with fewer than four
    fields, including blank ones, are skipped.
    """
    rows = []
    for line in lines[1:]:
        fields = line.split(',')
        if len(fields) >= 4:
            rows.append(
                (fields[0], _to_number(fields[1]),
                 _to_number(fields[2]), _to_number(fields[3]))
            )
    return rows


def regions_above_average(rows):
    """Return the regions whose suspected count exceeds the mean, highest first.

    The mean is taken over the parsed rows.  The original divided by
    len(lines) - 2 and iterated lines[1:-1]; both are only right when the
    file ends with exactly one trailing newline.
    """
    if not rows:
        return []
    average = total([row[1] for row in rows]) / len(rows)
    suspected = {row[0]: row[1] for row in rows}
    ranked = sorted(suspected.items(), key=lambda pair: pair[1], reverse=True)
    return [region for region, value in ranked if value > average]


if __name__ == "__main__":
    # Question 1: show the first five lines of the file.
    with open("data301.csv", "r") as f:
        lines = f.read().split("\n")
    for line in lines[:5]:
        print(line)

    # Question 2: totals and maxima of the three numeric columns.
    rows = parse_regions(lines)
    yslst = [row[1] for row in rows]
    zzlst = [row[2] for row in rows]
    zylst = [row[3] for row in rows]
    print('统计,疑似,重症,治愈')
    print('总数,{}, {}, {}'.format(total(yslst), total(zzlst), total(zylst)))
    if rows:  # max() of an empty list raises ValueError
        print('最大值,{}, {}, {}'.format(max(yslst), max(zzlst), max(zylst)))

    # Question 3: regions above the average suspected count.
    print(','.join(regions_above_average(rows)))
第 18 套题
例题:标点与字词统计
任务要求:统计标点数量,按字数分类统计词频。
import jieba
def group_words_by_length(words):
    """Return {length: [words of that length, in input order]}."""
    groups = {}
    for word in words:
        groups.setdefault(len(word), []).append(word)
    return groups


def most_frequent(words):
    """Return the most frequent item of *words* (first seen wins ties), or None if empty."""
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    if not counts:
        return None
    return max(counts, key=counts.get)


if __name__ == "__main__":
    with open("data301.txt", "r") as fs:
        lss = fs.read()
    # NOTE(review): these marks are ASCII in the listing except "。"; the
    # data may use the full-width forms - confirm before trusting counts.
    for c in ",。?!:":
        print('"{}"的个数为{}个'.format(c, lss.count(c)))

    groups = group_words_by_length(jieba.lcut(lss))
    for size in range(2, 6):
        # .get() avoids the KeyError the original raised when the text had
        # no word of this length.
        same_length = groups.get(size, [])
        print('{}字词有{}个'.format(size, len(same_length)))
        if same_length:
            print(most_frequent(same_length))
第 19 套题
例题:发言记录分析
任务要求:统计行数、发言人及每人高频词。
import jieba
def group_speeches(lines):
    """Return {speaker: [utterance, ...]} from "speaker:utterance" lines.

    Lines are stripped first.  Blank lines and lines without ":" are
    skipped (the original raised IndexError on the latter).  Only the first
    ":" separates speaker from text, so colons inside the utterance are
    kept; the original dropped everything after a second colon.
    NOTE(review): the separator is ASCII ":" in the listing; the data may
    use the full-width colon - confirm.
    """
    speeches = {}
    for line in lines:
        speaker, separator, said = line.strip().partition(":")
        if not separator:
            continue
        speeches.setdefault(speaker, []).append(said)
    return speeches


def summarize_words(tokens):
    """Return (distinct_count, top_word) over tokens longer than one character.

    *top_word* is the most frequent such token (first seen wins ties), or
    "" when there is none.
    """
    counts = {}
    for token in tokens:
        if len(token) > 1:
            counts[token] = counts.get(token, 0) + 1
    top_word = max(counts, key=counts.get) if counts else ""
    return len(counts), top_word


if __name__ == "__main__":
    with open("data301.txt", "r") as fs:
        lss = fs.readlines()

    lens = sum(1 for lr in lss if lr != "\n")
    print("共{}个非空行。".format(lens))

    ts = group_speeches(lss)
    print("共{}个人发言:{}".format(len(ts), ",".join(ts)))

    for speaker, utterances in ts.items():
        tokens = [word for said in utterances for word in jieba.lcut(said)]
        distinct, top_word = summarize_words(tokens)
        print("{}说了{}个词,最多的词是:{}".format(speaker, distinct, top_word))