TF Simple Chatbot
==========

References
--

TF chatbot: http://blog.topspeedsnail.com/archives/10735
Tencent word segmentation service: http://nlp.qq.com/
BosonNLP: http://bosonnlp.com/

0 Dataset
--

Movie and TV dialogue corpus.

Download:
Baidu Cloud: http://pan.baidu.com/s/1hsfYAES
github: https://github.com/rustch3n/dgk_lost_conv/blob/master/dgk_shooter_min.conv.zip

1 Data Preprocessing
--

1data_pre.py

~~~
import os
import random

conv_path = 'dgk_shooter_min.conv'

if not os.path.exists(conv_path):
    print('Dataset not found')
    exit()

# Dataset format
"""
E
M 畹/華/吾/侄/
M 你/接/到/這/封/信/的/時/候/
M 不/知/道/大/伯/還/在/不/在/人/世/了/
E
M 咱/們/梅/家/從/你/爺/爺/起/
M 就/一/直/小/心/翼/翼/地/唱/戲/
M 侍/奉/宮/廷/侍/奉/百/姓/
M 從/來/不/曾/遭/此/大/禍/
M 太/后/的/萬/壽/節/誰/敢/不/穿/紅/
M 就/你/膽/兒/大/
M 唉/這/我/舅/母/出/殯/
M 我/不/敢/穿/紅/啊/
M 唉/呦/唉/呦/爺/
M 您/打/得/好/我/該/打/
M 就/因/為/沒/穿/紅/讓/人/賞/咱/一/紙/枷/鎖/
M 爺/您/別/給/我/戴/這/紙/枷/鎖/呀/
E
M 您/多/打/我/幾/下/不/就/得/了/嗎/
M 走/
M 這/是/哪/一/出/啊/…/ / /這/是/
M 撕/破/一/點/就/弄/死/你/
M 唉/
M 記/著/唱/戲/的/再/紅/
M 還/是/讓/人/瞧/不/起/
M 大/伯/不/想/讓/你/挨/了/打/
M 還/得/跟/人/家/說/打/得/好/
M 大/伯/不/想/讓/你/再/戴/上/那/紙/枷/鎖/
M 畹/華/開/開/門/哪/
E
...
"""
# I first used the Sublime text editor to convert dgk_shooter_min.conv to UTF-8, which saved a lot of trouble

convs = []  # all conversations
with open(conv_path, encoding="utf8", errors='ignore') as f:
    one_conv = []  # one complete conversation
    for line in f:
        line = line.strip('\n').replace('/', '')
        if line == '':
            continue
        if line[0] == 'E':
            if one_conv:
                convs.append(one_conv)
            one_conv = []
        elif line[0] == 'M':
            one_conv.append(line.split(' ')[1])

# split each conversation into questions and answers
ask = []       # questions
response = []  # answers
for conv in convs:
    if len(conv) == 1:
        continue
    if len(conv) % 2 != 0:  # odd number of utterances: drop the last one to make it even
        conv = conv[:-1]
    for i in range(len(conv)):
        if i % 2 == 0:
            ask.append(conv[i])
        else:
            response.append(conv[i])

"""
print(len(ask), len(response))
print(ask[:3])
print(response[:3])
['畹華吾侄', '咱們梅家從你爺爺起', '侍奉宮廷侍奉百姓']
['你接到這封信的時候', '就一直小心翼翼地唱戲', '從來不曾遭此大禍']
"""

def convert_seq2seq_files(questions, answers, TESTSET_SIZE=1000):
    # create the output files
    train_enc = open('train.enc', 'w')  # questions
    train_dec = open('train.dec', 'w')  # answers
    test_enc = open('test.enc', 'w')    # questions
    test_dec = open('test.dec', 'w')    # answers

    # hold out 1000 pairs as test data
    test_index = random.sample([i for i in range(len(questions))], TESTSET_SIZE)

    for i in range(len(questions)):
        if i in test_index:
            test_enc.write(questions[i] + '\n')
            test_dec.write(answers[i] + '\n')
        else:
            train_enc.write(questions[i] + '\n')
            train_dec.write(answers[i] + '\n')
        if i % 1000 == 0:
            print(len(questions), 'progress:', i)

    train_enc.close()
    train_dec.close()
    test_enc.close()
    test_dec.close()

convert_seq2seq_files(ask, response)
# the generated *.enc files hold the questions
# the generated *.dec files hold the answers
~~~
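The later steps read the *.enc and *.dec files line by line in parallel, so the i-th question must stay aligned with the i-th answer. A minimal sanity check you can run after preprocessing (not part of the original scripts; it only assumes the four files generated above) might look like this:

~~~
# Optional sanity check: the encoder and decoder files must contain
# the same number of lines, otherwise question/answer pairs drift apart.
def count_lines(path):
    with open(path, encoding="utf8") as f:
        return sum(1 for _ in f)

for enc, dec in [('train.enc', 'train.dec'), ('test.enc', 'test.dec')]:
    n_enc, n_dec = count_lines(enc), count_lines(dec)
    print(enc, n_enc, dec, n_dec)
    assert n_enc == n_dec, 'question/answer files are misaligned'
~~~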
2 Create the Vocabulary and Convert Dialogue to Vectors
--

The generated train_encode.vec and train_decode.vec are used for training; the corresponding vocabularies are train_encode_vocabulary and train_decode_vocabulary.

2word2vec.py

~~~
# paths of the question/answer files generated in the previous step
train_encode_file = 'train.enc'
train_decode_file = 'train.dec'
test_encode_file = 'test.enc'
test_decode_file = 'test.dec'

print('Creating vocabulary...')
# special tokens used for padding and marking the dialogue
PAD = "__PAD__"
GO = "__GO__"
EOS = "__EOS__"  # end of sentence
UNK = "__UNK__"  # marks characters that are not in the vocabulary
START_VOCABULART = [PAD, GO, EOS, UNK]
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3
# see tensorflow.models.rnn.translate.data_utils
vocabulary_size = 500

# generate a vocabulary file
def gen_vocabulary_file(input_file, output_file):
    vocabulary = {}
    with open(input_file) as f:
        counter = 0
        for line in f:
            counter += 1
            tokens = [word for word in line.strip()]
            for word in tokens:
                if word in vocabulary:
                    vocabulary[word] += 1
                else:
                    vocabulary[word] = 1
    vocabulary_list = START_VOCABULART + sorted(vocabulary, key=vocabulary.get, reverse=True)
    # keep the 500 most frequent characters, which should be roughly enough
    if len(vocabulary_list) > 500:
        vocabulary_list = vocabulary_list[:500]
    print(input_file + " vocabulary size:", len(vocabulary_list))
    with open(output_file, "w") as ff:
        for word in vocabulary_list:
            ff.write(word + "\n")

gen_vocabulary_file(train_encode_file, "train_encode_vocabulary")
gen_vocabulary_file(train_decode_file, "train_decode_vocabulary")

train_encode_vocabulary_file = 'train_encode_vocabulary'
train_decode_vocabulary_file = 'train_decode_vocabulary'

print("Converting dialogue to vectors...")
# convert each dialogue string into a vector of character ids
def convert_to_vector(input_file, vocabulary_file, output_file):
    tmp_vocab = []
    with open(vocabulary_file, "r") as f:
        tmp_vocab.extend(f.readlines())
    tmp_vocab = [line.strip() for line in tmp_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(tmp_vocab)])
    # {'碩': 3142, 'v': 577, 'I': 4789, '\ue796': 4515, '拖': 1333, '疤': 2201 ...}
    output_f = open(output_file, 'w')
    with open(input_file, 'r') as f:
        for line in f:
            line_vec = []
            for words in line.strip():
                line_vec.append(vocab.get(words, UNK_ID))
            output_f.write(" ".join([str(num) for num in line_vec]) + "\n")
    output_f.close()

convert_to_vector(train_encode_file, train_encode_vocabulary_file, 'train_encode.vec')
convert_to_vector(train_decode_file, train_decode_vocabulary_file, 'train_decode.vec')
convert_to_vector(test_encode_file, train_encode_vocabulary_file, 'test_encode.vec')
convert_to_vector(test_decode_file, train_decode_vocabulary_file, 'test_decode.vec')
~~~
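To see what this conversion produces, a small round-trip sketch (not part of the original scripts; it only assumes the train_encode_vocabulary file generated above) maps a sentence to ids and back. The line number of a character in the vocabulary file is its id, and any character outside the 500-entry vocabulary falls back to UNK_ID = 3:

~~~
# Illustrative round trip, assuming train_encode_vocabulary exists in the
# current directory; line index in the vocabulary file == character id.
UNK_ID = 3

with open('train_encode_vocabulary', encoding="utf8") as f:
    id_to_char = [line.strip() for line in f]
char_to_id = {c: i for i, c in enumerate(id_to_char)}

sentence = '你好'
ids = [char_to_id.get(c, UNK_ID) for c in sentence]
print(ids)                                  # actual ids depend on corpus character frequencies
print(''.join(id_to_char[i] for i in ids))  # recovers the characters (UNK for rare ones)
~~~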
3 Training
--

3tftrain.py

~~~
import tensorflow as tf  # 0.12
from tensorflow.models.rnn.translate import seq2seq_model
import os
import numpy as np
import math

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

train_encode_vec = 'train_encode.vec'
train_decode_vec = 'train_decode.vec'
test_encode_vec = 'test_encode.vec'
test_decode_vec = 'test_decode.vec'

# vocabulary size 500
vocabulary_encode_size = 500
vocabulary_decode_size = 500

buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]
layer_size = 256  # units per layer
num_layers = 3    # number of layers
batch_size = 64

# read the *encode.vec and *decode.vec data (small enough to load into memory at once)
def read_data(source_path, target_path, max_size=None):
    data_set = [[] for _ in buckets]
    with tf.gfile.GFile(source_path, mode="r") as source_file:
        with tf.gfile.GFile(target_path, mode="r") as target_file:
            source, target = source_file.readline(), target_file.readline()
            counter = 0
            while source and target and (not max_size or counter < max_size):
                counter += 1
                source_ids = [int(x) for x in source.split()]
                target_ids = [int(x) for x in target.split()]
                target_ids.append(EOS_ID)
                for bucket_id, (source_size, target_size) in enumerate(buckets):
                    if len(source_ids) < source_size and len(target_ids) < target_size:
                        data_set[bucket_id].append([source_ids, target_ids])
                        break
                source, target = source_file.readline(), target_file.readline()
    return data_set

model = seq2seq_model.Seq2SeqModel(source_vocab_size=vocabulary_encode_size,
                                   target_vocab_size=vocabulary_decode_size,
                                   buckets=buckets, size=layer_size, num_layers=num_layers,
                                   max_gradient_norm=5.0, batch_size=batch_size,
                                   learning_rate=0.5, learning_rate_decay_factor=0.97,
                                   forward_only=False)

config = tf.ConfigProto()
config.gpu_options.allocator_type = 'BFC'  # prevent out-of-memory errors

with tf.Session(config=config) as sess:
    # resume the previous training run, if a checkpoint exists
    ckpt = tf.train.get_checkpoint_state('.')
    if ckpt is not None:
        print(ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    train_set = read_data(train_encode_vec, train_decode_vec)
    test_set = read_data(test_encode_vec, test_decode_vec)

    train_bucket_sizes = [len(train_set[b]) for b in range(len(buckets))]
    train_total_size = float(sum(train_bucket_sizes))
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size for i in range(len(train_bucket_sizes))]

    loss = 0.0
    total_step = 0
    previous_losses = []
    # train indefinitely, saving the model every so often
    while True:
        random_number_01 = np.random.random_sample()
        bucket_id = min([i for i in range(len(train_buckets_scale)) if train_buckets_scale[i] > random_number_01])

        encoder_inputs, decoder_inputs, target_weights = model.get_batch(train_set, bucket_id)
        _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, False)

        loss += step_loss / 500
        total_step += 1

        print(total_step)
        if total_step % 500 == 0:
            print(model.global_step.eval(), model.learning_rate.eval(), loss)

            # if the model has not improved, decrease the learning rate
            if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                sess.run(model.learning_rate_decay_op)
            previous_losses.append(loss)
            # save the model
            checkpoint_path = "chatbot_seq2seq.ckpt"
            model.saver.save(sess, checkpoint_path, global_step=model.global_step)
            loss = 0.0
            # evaluate the model on the test data
            for bucket_id in range(len(buckets)):
                if len(test_set[bucket_id]) == 0:
                    continue
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(test_set, bucket_id)
                _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)
                eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
                print(bucket_id, eval_ppx)
~~~

4 Chatbot
--

4tfbot.py

![](http://blog.topspeedsnail.com/wp-content/uploads/2016/12/%E5%B1%8F%E5%B9%95%E5%BF%AB%E7%85%A7-2016-12-06-%E4%B8%8B%E5%8D%8812.13.08.png)

~~~
import tensorflow as tf  # 0.12
from tensorflow.models.rnn.translate import seq2seq_model
import os
import numpy as np

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

train_encode_vocabulary = 'train_encode_vocabulary'
train_decode_vocabulary = 'train_decode_vocabulary'

def read_vocabulary(input_file):
    tmp_vocab = []
    with open(input_file, "r") as f:
        tmp_vocab.extend(f.readlines())
    tmp_vocab = [line.strip() for line in tmp_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(tmp_vocab)])
    return vocab, tmp_vocab

vocab_en, _ = read_vocabulary(train_encode_vocabulary)
_, vocab_de = read_vocabulary(train_decode_vocabulary)

# vocabulary size 500; this must match the sizes used during training,
# otherwise restoring the checkpoint fails with a shape mismatch
vocabulary_encode_size = 500
vocabulary_decode_size = 500

buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]
layer_size = 256  # units per layer
num_layers = 3    # number of layers
batch_size = 1

model = seq2seq_model.Seq2SeqModel(source_vocab_size=vocabulary_encode_size,
                                   target_vocab_size=vocabulary_decode_size,
                                   buckets=buckets, size=layer_size, num_layers=num_layers,
                                   max_gradient_norm=5.0, batch_size=batch_size,
                                   learning_rate=0.5, learning_rate_decay_factor=0.99,
                                   forward_only=True)
model.batch_size = 1

with tf.Session() as sess:
    # restore the trained model
    ckpt = tf.train.get_checkpoint_state('.')
    if ckpt is not None:
        print(ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        print("Model not found")

    while True:
        input_string = input('me > ')
        # type 'quit' to exit
        if input_string == 'quit':
            exit()

        input_string_vec = []
        for words in input_string.strip():
            input_string_vec.append(vocab_en.get(words, UNK_ID))
        bucket_id = min([b for b in range(len(buckets)) if buckets[b][0] > len(input_string_vec)])
        encoder_inputs, decoder_inputs, target_weights = model.get_batch({bucket_id: [(input_string_vec, [])]}, bucket_id)
        _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)
        outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
        if EOS_ID in outputs:
            outputs = outputs[:outputs.index(EOS_ID)]
        response = "".join([tf.compat.as_str(vocab_de[output]) for output in outputs])
        print('AI > ' + response)
~~~
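Both scripts lean on the bucketing scheme from the translate example: during training, read_data files each pair under the smallest bucket that can hold the question and the answer plus its EOS token (pairs too long for every bucket are silently dropped), while the chat loop picks the smallest bucket whose encoder side is longer than the input. A standalone sketch of that selection logic, with hypothetical helper names and the same buckets as above, for reference:

~~~
# Hypothetical helpers that mirror the bucket selection used in the two scripts.
buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]

def pick_training_bucket(source_ids, target_ids):
    # smallest bucket that fits the question and the answer plus its EOS token
    for bucket_id, (source_size, target_size) in enumerate(buckets):
        if len(source_ids) < source_size and len(target_ids) + 1 < target_size:
            return bucket_id
    return None  # too long for every bucket: the pair is dropped

def pick_chat_bucket(input_ids):
    # smallest bucket whose encoder side is strictly longer than the input
    return min(b for b in range(len(buckets)) if buckets[b][0] > len(input_ids))

print(pick_training_bucket([1, 2, 3], [4, 5, 6, 7]))  # -> 0, fits (5, 10)
print(pick_chat_bucket(list(range(8))))               # -> 1, needs the (10, 15) bucket
~~~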