最終更新日:2021/10/11 08:51
# 言語学B追加情報
## 文章生成
以下が解析に使用したソースコード
実行には `mecab` のインストールと、スクリプトと同じディレクトリに形態素群として使用する `bocchan.txt` が必要
```python
import os
import subprocess
import random
# Sentence-terminating token: the Japanese full stop.
END_OF_SENTENCE = "。"
def cmd(command):
    """Run *command* through the shell and return its standard output as text.

    Args:
        command: A shell command line. It is executed with ``shell=True`` so
            that pipelines (``a | b``) work; callers are responsible for
            quoting any fragment that is not a trusted literal.

    Returns:
        Everything the command wrote to stdout, decoded as UTF-8.
        The exit status is not checked.
    """
    completed = subprocess.run(command, stdout=subprocess.PIPE, shell=True)
    return completed.stdout.decode('utf-8')
def words(filename):
    """Tokenize a text file with mecab and return the tokens as a list.

    The file is piped through ``mecab`` and the first whitespace-separated
    field of every output line is kept (``awk '{print $1}'``).
    NOTE(review): presumably that field is the surface form, with mecab's
    ``EOS`` marker lines included as ``"EOS"`` tokens - confirm against the
    installed mecab output format.

    Args:
        filename: Path of the text file to tokenize.

    Returns:
        List of token strings in file order.
    """
    # Local import so this function does not depend on the file's import block.
    import shlex

    # Quote the path: it is spliced into a shell pipeline, so spaces or shell
    # metacharacters would otherwise break the command (or run something else).
    return cmd("cat " + shlex.quote(filename) + " | mecab | awk '{print $1}'").split()
def next_word(current_words_v, words_v, probability_v):
    """Choose the word that follows the current context by weighted sampling.

    Every position in ``words_v`` whose preceding word equals one of the
    context words contributes that context word's weight to the word found at
    that position. A candidate's weights are summed over all its occurrences
    and one candidate is then drawn with probability proportional to its total.

    Args:
        current_words_v: The most recent words of the sentence being built,
            oldest first.
        words_v: The corpus as a flat list of tokens.
        probability_v: Weights paired with the context words from the most
            recent one backwards (``probability_v[0]`` belongs to the last
            word of ``current_words_v``). Extra entries on either side are
            ignored.

    Returns:
        The selected next word.

    Raises:
        SystemExit: After printing ``"Error"`` when the corpus offers no
            candidate for the given context.
    """
    # Weight of each context word. The most recent word is visited first and
    # the first match wins, mirroring a front-to-back search of the context.
    context_weight = {}
    for context_word, weight in zip(reversed(current_words_v), probability_v):
        context_weight.setdefault(context_word, weight)

    # Punctuation and the end-of-input marker are never offered as candidates.
    skipped_tokens = ("、", "「", "」", "EOS")

    # Search for candidate next words.
    candidate_weight = {}
    carried_weight = 0  # weight earned by the previous corpus word
    for word in words_v:
        if carried_weight > 0 and word not in skipped_tokens:
            # Accumulate over every occurrence so that frequent continuations
            # are proportionally more likely (previously the running sum was
            # overwritten right after being updated).
            candidate_weight[word] = candidate_weight.get(word, 0) + carried_weight
        carried_weight = context_weight.get(word, 0)

    # Compute the total weight.
    total_weight = sum(candidate_weight.values())

    # Select and return the next word by walking the cumulative weights.
    threshold = random.uniform(0, total_weight)
    running_total = 0
    for candidate, weight in candidate_weight.items():
        running_total += weight
        if threshold <= running_total:
            return candidate

    # Reached only when there is no candidate at all.
    print("Error")
    raise SystemExit
def gen_sentence_v(textfile, ngram_v=(1,), limit_word_count_under=0, limit_word_count_up=256):
    """Generate one sentence, as a list of words, from the corpus in *textfile*.

    A random corpus word that mecab classifies as a noun starts the sentence.
    Words are then appended one at a time via ``next_word`` until the
    end-of-sentence token is produced. Attempts that end too early or grow too
    long are discarded and generation restarts from a new starting word.

    Args:
        textfile: Path of the corpus text file (tokenized with ``words``).
        ngram_v: Weights handed to ``next_word``; entry ``k`` applies to the
            word ``k`` positions back from the end of the sentence so far.
        limit_word_count_under: Minimum number of appended words (the starting
            word is not counted) for a finished sentence to be accepted.
        limit_word_count_up: Maximum number of appended words; an attempt that
            reaches this count without ending is discarded.

    Returns:
        The accepted sentence as a list of words, ending with the
        end-of-sentence token.

    Raises:
        ValueError: If the corpus contains no words.
    """
    # Local import so this function does not depend on the file's import block.
    import shlex

    words_list = words(textfile)
    if not words_list:
        raise ValueError("no words could be read from " + repr(textfile))

    # Pass next_word as many trailing words as there are weights, so weight
    # lists longer than four are honoured instead of silently truncated.
    context_size = len(ngram_v)

    while True:
        # Pick a starting word: retry until mecab reports it as a noun.
        while True:
            initial_word = random.choice(words_list)
            # Quote the token: corpus text can contain shell metacharacters.
            initial_word_class = cmd(
                "echo " + shlex.quote(initial_word) + " | mecab | awk '{print $2}'").split(",")[0]
            if initial_word_class == "名詞" and initial_word != "EOS":
                break

        sentence_v = [initial_word]
        index = 1  # number of words appended after the starting word
        while True:
            append_word = next_word(sentence_v[-context_size:], words_list, ngram_v)
            sentence_v.append(append_word)
            if append_word == END_OF_SENTENCE:
                if index >= limit_word_count_under:
                    return sentence_v
                break  # ended too early; start over
            if index == limit_word_count_up:
                break  # too long without ending; start over
            index += 1
# Weight sets to compare. For each set, print its label (the weights joined by
# commas) followed by 20 generated sentences.
NGRAM_WEIGHT_SETS = (
    [1],
    [1, 10],
    [1, 10, 100],
    [1, 10, 100, 1000],
    [1, 5, 25, 125],
    [1, 2, 4, 8],
)
for ngram_weights in NGRAM_WEIGHT_SETS:
    print(",".join(str(weight) for weight in ngram_weights))
    for _ in range(20):
        # Each sentence must have between 10 and 20 appended words.
        print("".join(gen_sentence_v("bocchan.txt", ngram_weights, 10, 20)))
```
## 集計
以下が集計に利用したソースコード
上の文章生成で返ってきた文字列を加工したファイル `result.csv` と、Googleフォームで集計した結果をCSVとして出力した `集計.csv` が同一ディレクトリに必要
```python
import csv
from collections import Counter

# For every row of result.csv, count how many cells of 集計.csv are equal to
# any item of that row, and print the per-row totals as a list.
# NOTE(review): both files are read as UTF-8 - confirm this matches how the
# CSV files were exported.
with open("集計.csv", encoding="utf-8", newline="") as file, \
        open("result.csv", encoding="utf-8", newline="") as data:
    # newline="" lets the csv module handle line endings inside quoted fields.
    file_table = list(csv.reader(file))
    data_table = list(csv.reader(data))

# Count each distinct cell of 集計.csv once instead of rescanning the whole
# table for every item.
cell_counts = Counter(cell for row in file_table for cell in row)

file_sum = [
    sum(cell_counts[data_item] for data_item in data_class)
    for data_class in data_table
]
print(file_sum)
```