import chardet
import re


def get_jp_text(path):
    with open(path, "rb") as f:
        # Detect the file encoding
        full_data = f.read()
        chaInfo = chardet.detect(full_data)
        # chardet may report GB2312/GBK for text that actually needs the
        # superset GB18030, so widen the encoding in that case
        if chaInfo["encoding"].upper() in ["GB2312", "GBK"]:
            chaInfo["encoding"] = "GB18030"
        # Store the Japanese lines in a list
        f.seek(0)
        list_data = f.readlines()
        jp_texts = []
        for data in list_data:
            text = data.decode(chaInfo["encoding"])
            jp_texts.append(text.rstrip("\n").rstrip("\r"))
    # Convert the jp text into a dictionary
    """
    Found a bug: going through json drops duplicate keys,
    but a novel has identical lines (strings of "ああ" and the like),
    so the clumsy manual approach below is safer.
    """
    with open("TransFile.json", "w", encoding="UTF-8") as f:
        f.write("{\n")
        for i, text in enumerate(jp_texts):
            # Compare by index, not by value: a mid-file line identical to
            # the last line would otherwise lose its trailing comma
            if i != len(jp_texts) - 1:
                f.write(f"""    "{text}":"{text}",\n""")
            else:
                f.write(f"""    "{text}":"{text}"\n""")
        f.write("}")
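One more thing the manual loop above doesn't handle: if a line itself contains a double quote or a backslash, the raw f-string write produces invalid JSON. Here is a minimal sketch of the same loop with json.dumps doing the escaping (json is in the standard library; the sample jp_texts is made up):

import json

jp_texts = ["ああああ", "ああああ", "終わり"]  # stand-in for the list built above

with open("TransFile.json", "w", encoding="UTF-8") as f:
    f.write("{\n")
    for i, text in enumerate(jp_texts):
        key = json.dumps(text, ensure_ascii=False)  # escapes " and \ so the JSON stays valid
        comma = "," if i != len(jp_texts) - 1 else ""
        f.write(f"    {key}:{key}{comma}\n")
    f.write("}")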
def zh_novel(path):
    with open(path, "r", encoding="utf-8") as f:
        """
        This was originally done with json as well, but duplicate keys
        get dropped, so it was switched to a regex.
        """
        data = f.readlines()
        data = data[1:-1]  # drop the opening "{" and closing "}" lines
        new_data = []
        for text in data:
            zh_text = re.search(r'".+":"(.+)",?$', text)
            if zh_text:  # skip lines the pattern does not match
                new_data.append(zh_text[1])
    with open("中文翻译.txt", "w", encoding="utf-8") as f:
        for data in new_data:
            f.write(f"{data}\n")
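For reference, this is roughly what the regex pulls out of one line of the translated file (the sample line is invented):

import re

line = '    "人間失格":"人间失格",'
match = re.search(r'".+":"(.+)",?$', line)
print(match[1])  # -> 人间失格 (the translated value, without quotes or the trailing comma)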
if __name__ == "__main__":
    # Replace the path with your own file path
    get_jp_text(r"C:\人間失格.txt")  # convert the txt into a dictionary
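Putting the two functions together, the intended round trip presumably looks like this (the second call assumes you save the translated values back into TransFile.json, the filename get_jp_text writes above):

get_jp_text(r"C:\人間失格.txt")  # step 1: novel txt -> TransFile.json
# ... translate the values in TransFile.json by hand or with a translation service ...
zh_novel("TransFile.json")       # step 2: translated JSON -> 中文翻译.txt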
def get_jp_text(path):
    with open(path, "rb") as f:
        .......
That part is basically fine and shouldn't need changes. chardet is a third-party library for detecting the encoding. That said, with files you scraped yourself you usually already know the encoding, and it is consistent across files, so you can simplify it to:

with open(path, "r", encoding="your file's encoding") as f:
    list_data = f.readlines()
    jp_texts = []
    for data in list_data:
        jp_texts.append(data.rstrip("\n").rstrip("\r"))  # was text, which is undefined here; data is the loop variable
    # Convert the jp text into a dictionary
    jp_dict = {text: text for text in jp_texts}
    with open("TransFile.json", "w", encoding="UTF-8") as f:
        json.dump(jp_dict, f, ensure_ascii=False, indent=4)

(This version needs import json at the top.)
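One caveat with this shortcut, which the comment in the first version already pointed out: a dict keeps only one copy of each repeated line, so identical paragraphs collapse. A quick check (the sample lines are made up):

lines = ["ああああ", "静かな夜", "ああああ"]
print(len({t: t for t in lines}))  # 2, not 3 -- the duplicate line is gone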