言語学B追加情報

最終更新日：2021/10/11 08:51

# 言語学B追加情報

## 文章生成

以下が解析に使用したソースコード。

`mecab` のインストールと、同じディレクトリに形態素群として使用する `bocchan.txt` が必要。

```python
import os
import random
import shlex
import subprocess

END_OF_SENTENCE = "。"
# Tokens that are never offered as a next word.
SKIPPED_TOKENS = ("、", "「", "」", "EOS")


def cmd(command):
    """Run ``command`` through the shell and return its stdout decoded as UTF-8.

    The command is executed with ``shell=True`` because callers pass whole
    pipelines; callers are responsible for quoting any value they interpolate.
    """
    completed = subprocess.run(command, stdout=subprocess.PIPE, shell=True)
    return completed.stdout.decode("utf-8")


def words(filename):
    """Return the first column of MeCab's output for ``filename`` as a list.

    The output is split on whitespace, so the list holds one entry per
    MeCab output line (including the ``EOS`` marker lines filtered elsewhere).
    """
    # Quote the path so spaces or shell metacharacters cannot break the pipeline.
    return cmd("cat " + shlex.quote(filename) + " | mecab | awk '{print $1}'").split()


def next_word(current_words_v, words_v, probability_v):
    """Randomly pick the word that follows the tail of the current sentence.

    ``current_words_v`` is paired, most recent word first, with the weights in
    ``probability_v``. Every token of ``words_v`` that directly follows an
    occurrence of one of those words becomes a candidate carrying that word's
    weight; one candidate is then drawn with probability proportional to its
    weight.

    Prints ``Error`` and exits the program when no candidate exists.
    """
    # Most recent word first, so probability_v[0] belongs to the last word.
    # setdefault keeps the first (most recent) weight when a word repeats.
    weight_of = {}
    for current_word, probability in zip(reversed(current_words_v), probability_v):
        weight_of.setdefault(current_word, probability)

    # Collect candidate next words.
    next_word_list = {}
    probability_index = 0  # weight earned by the previous token, 0 if none
    for word in words_v:
        if probability_index > 0 and word not in SKIPPED_TOKENS:
            # NOTE(review): the original accumulated with `+=` and then
            # immediately overwrote the entry with `=`, so each candidate
            # keeps the weight of its last match rather than a sum over all
            # matches. That effective behaviour is preserved here; confirm
            # whether accumulation was intended.
            next_word_list[word] = probability_index
        # NOTE(review): indentation was lost in the original paste; the reset
        # is assumed to happen for every token - confirm.
        probability_index = weight_of.get(word, 0)

    # Sum the candidate weights.
    overall_probability = sum(next_word_list.values())

    # Pick the next word and return it.
    next_word_value = random.uniform(0, overall_probability)
    sum_probability = 0
    for candidate, candidate_probability in next_word_list.items():
        sum_probability += candidate_probability
        if next_word_value <= sum_probability:
            return candidate

    print("Error")
    raise SystemExit


def gen_sentence_v(textfile, ngram_v=(1,), limit_word_count_under=0, limit_word_count_up=256):
    """Generate one sentence, as a list of words, from the corpus ``textfile``.

    A random noun is chosen as the first word and ``next_word`` is called
    repeatedly until it returns ``END_OF_SENTENCE``. An attempt is discarded
    and restarted when the sentence ends before ``limit_word_count_under``
    words were appended or when ``limit_word_count_up`` appended words are
    reached without an end of sentence.

    ``ngram_v`` holds the weights passed to ``next_word``; only the last four
    words of the sentence are handed over, so at most four weights are used.

    Raises ``ValueError`` when the corpus yields no words.
    """
    words_list = words(textfile)
    if not words_list:
        raise ValueError("no words could be read from " + repr(textfile))

    while True:
        # Choose a random noun as the first word.
        while True:
            initial_word = words_list[random.randint(0, len(words_list) - 1)]
            initial_word_class = cmd(
                "echo " + shlex.quote(initial_word) + " | mecab | awk '{print $2}'"
            ).split(",")[0]
            if initial_word_class == "名詞" and initial_word != "EOS":
                break

        sentence_v = [initial_word]
        index = 1
        while True:
            append_word = next_word(sentence_v[-4:], words_list, ngram_v)
            sentence_v.append(append_word)
            if append_word == END_OF_SENTENCE:
                if index >= limit_word_count_under:
                    return sentence_v
                break  # too short: start over
            if index == limit_word_count_up:
                break  # too long: start over
            index += 1


def main():
    """Print 20 generated sentences for each weight configuration."""
    weight_sets = [
        [1],
        [1, 10],
        [1, 10, 100],
        [1, 10, 100, 1000],
        [1, 5, 25, 125],
        [1, 2, 4, 8],
    ]
    for weights in weight_sets:
        # Header line, e.g. "1,10,100".
        print(",".join(str(weight) for weight in weights))
        for _ in range(20):
            print("".join(gen_sentence_v("bocchan.txt", weights, 10, 20)))


if __name__ == "__main__":
    main()
```

## 集計

以下が集計に利用したソースコード。

上の文章生成で返ってきた文字列ファイルを加工した `result.csv` と、Googleフォームで集計したものの結果をCSVとして吐き出したもの `集計.csv` が同一ディレクトリに必要。

```python
import csv
from collections import Counter

# newline="" is what the csv module expects for files handed to csv.reader.
with open("集計.csv", encoding="utf-8", newline="") as survey_file, \
        open("result.csv", encoding="utf-8", newline="") as result_file:
    # How many times each cell value appears anywhere in 集計.csv.
    survey_counts = Counter(
        cell for row in csv.reader(survey_file) for cell in row
    )
    # One total per row of result.csv: occurrences in 集計.csv of that row's items.
    file_sum = [
        sum(survey_counts[item] for item in row)
        for row in csv.reader(result_file)
    ]

print(file_sum)
```

≡