from pickle import load
from numpy import array
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


def load_doc(filename):
    """Read a text file into a string.

    Args:
        filename: path of the text file

    Returns:
        string, the contents of the text file
    """
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text


def to_list(captions):
    """Flatten a dict (key: file name, value: list of image captions) into one list of captions.

    Args:
        captions: a dict whose keys are file names and whose values are lists of image captions

    Returns:
        list of all image captions
    """
    all_desc = list()
    # iterate over every image
    for key in captions.keys():
        all_desc.extend(captions[key])
    return all_desc


def get_max_length(captions):
    """Compute the length of the longest caption in a captions dict.

    Args:
        captions: a dict whose keys are file names (without the .jpg suffix) and whose values are lists of image captions

    Returns:
        length, in words, of the longest caption
    """
    lines = to_list(captions)
    return max(len(d.split()) for d in lines)
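
# A minimal usage sketch for to_list / get_max_length; the captions dict below
# is made up for illustration:
#
#   captions = {'1235345': ['startseq one bird on tree endseq'],
#               '1234546': ['startseq one boy run across water endseq']}
#   to_list(captions)         # -> both caption strings in one flat list
#   get_max_length(captions)  # -> 7 (the second caption has seven words)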


def load_set(filename):
    """Load a set of image names from a text file.

    Args:
        filename: a text file in which every line holds one image file name (with the .jpg suffix)

    Returns:
        set of file names with the .jpg suffix removed
    """
    doc = load_doc(filename)
    dataset = list()
    # process line by line
    for line in doc.split('\n'):
        # skip empty lines
        if len(line) < 1:
            continue
        # get the image identifier
        identifier = line.split('.')[0]
        dataset.append(identifier)
    return set(dataset)
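
# Illustrative call for load_set, assuming a split file whose lines look like
# Flickr8k entries such as '2513260012_03d33305cf.jpg':
#
#   load_set('Flickr_8k.trainImages.txt')
#   # -> {'2513260012_03d33305cf', ...}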


def load_image_names(filename):
    """Load a set of image names from a text file (equivalent to load_set above).

    :param filename: a text file in which every line holds one image file name (with the .jpg suffix)
    :return: set of file names with the .jpg suffix removed
    """
    doc = load_doc(filename)
    data_set = list()
    # process line by line
    for line in doc.split('\n'):
        # skip empty lines
        if len(line) < 1:
            continue
        # get the image identifier
        identifier = line.split('.')[0]
        data_set.append(identifier)
    return set(data_set)


def load_clean_captions(filename, dataset):
    """Wrap every image caption with 'startseq' and 'endseq', marking where caption generation starts and stops.

    Args:
        filename: a text file in which every line holds an image name followed by its already-cleaned caption
        dataset: list of image names

    Returns:
        dict, key is the image name, value is the list of its captions wrapped with 'startseq' and 'endseq'
    """
    # load document
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # skip empty lines, which would otherwise raise an IndexError below
        if len(tokens) < 1:
            continue
        # split id from description
        image_id, image_desc = tokens[0], tokens[1:]
        # skip images not in the set
        if image_id in dataset:
            # create list
            if image_id not in descriptions:
                descriptions[image_id] = list()
            # wrap description in tokens
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            # store
            descriptions[image_id].append(desc)
    return descriptions
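
# Illustrative input/output for load_clean_captions, assuming descriptions.txt
# holds lines of the form '<image_id> <cleaned caption>':
#
#   # descriptions.txt contains the line: 1235345 one bird on tree
#   load_clean_captions('descriptions.txt', ['1235345'])
#   # -> {'1235345': ['startseq one bird on tree endseq']}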


def load_photo_features(filename, dataset):
    """Load the features of the given images from a pickled feature file.

    Args:
        filename: name of the file holding the image features; once loaded it is a dict
            whose keys are 'Flicker8k_Dataset/' + file name and whose values are the
            features of the corresponding images
        dataset: list of image file names

    Returns:
        dict of image features, key is the file name,
        value is the features of the corresponding image
    """
    # load all features
    all_features = load(open(filename, 'rb'))
    # filter features
    features = {k: all_features[k] for k in dataset}
    return features
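
# Illustrative usage for load_photo_features, assuming 'features.pkl' pickles a
# {image_name: feature_array} dict (e.g. CNN encoder outputs of shape (1, 4096)):
#
#   features = load_photo_features('features.pkl', ['1235345'])
#   features['1235345'].shape  # -> (1, 4096)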


# Train the model on the data
# Read a set of image ids (same behavior as load_set above)
def load_ids(fn):
    doc = load_doc(fn)
    ret = list()
    for line in doc.split('\n'):
        if len(line) < 1:
            continue
        image_id = line.split('.')[0]
        ret.append(image_id)
    return set(ret)


def create_sequences(tokenizer, max_length, descriptions, photos_features, vocab_size):
    """Build one set of LSTM training inputs from caption lists and image features.

    Args:
        tokenizer: keras.preprocessing.text.Tokenizer for converting between English words and integers
        max_length: length of the longest caption in the training set
        descriptions: dict, key is the image name (without the .jpg suffix), value is a list of several different captions for the image
        photos_features: dict, key is the image name (without the .jpg suffix), value is a numpy array of image features
        vocab_size: number of words in the training-set vocabulary

    Returns:
        tuple:
            the first element is a numpy array whose elements are image features, each itself a numpy.array
            the second element is a numpy array whose elements are caption prefixes, each itself a numpy.array
            the third element is a numpy array whose elements are the next word of a caption (to be predicted from the image features and the caption prefix), each also a numpy.array

    Examples:
        from pickle import load
        tokenizer = load(open('tokenizer.pkl', 'rb'))
        max_length = 6
        descriptions = {'1235345':['startseq one bird on tree endseq', "startseq red bird on tree endseq"],
                        '1234546':['startseq one boy play water endseq', "startseq one boy run across water endseq"]}
        photo_features = {'1235345':[ 0.434, 0.534, 0.212, 0.98 ],
                          '1234546':[ 0.534, 0.634, 0.712, 0.28 ]}
        vocab_size = 7378
        print(create_sequences(tokenizer, max_length, descriptions, photo_features, vocab_size))
        (array([[ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ]]),
         array([[ 0, 0, 0, 0, 0, 2],
                [ 0, 0, 0, 0, 2, 59],
                [ 0, 0, 0, 2, 59, 254],
                [ 0, 0, 2, 59, 254, 6],
                [ 0, 2, 59, 254, 6, 134],
                [ 0, 0, 0, 0, 0, 2],
                [ 0, 0, 0, 0, 2, 26],
                [ 0, 0, 0, 2, 26, 254],
                [ 0, 0, 2, 26, 254, 6],
                [ 0, 2, 26, 254, 6, 134],
                [ 0, 0, 0, 0, 0, 2],
                [ 0, 0, 0, 0, 2, 59],
                [ 0, 0, 0, 2, 59, 16],
                [ 0, 0, 2, 59, 16, 82],
                [ 0, 2, 59, 16, 82, 24],
                [ 0, 0, 0, 0, 0, 2],
                [ 0, 0, 0, 0, 2, 59],
                [ 0, 0, 0, 2, 59, 16],
                [ 0, 0, 2, 59, 16, 165],
                [ 0, 2, 59, 16, 165, 127],
                [ 2, 59, 16, 165, 127, 24]]),
         array([[ 0., 0., 0., ..., 0., 0., 0.],
                [ 0., 0., 0., ..., 0., 0., 0.],
                [ 0., 0., 0., ..., 0., 0., 0.],
                ...,
                [ 0., 0., 0., ..., 0., 0., 0.],
                [ 0., 0., 0., ..., 0., 0., 0.],
                [ 0., 0., 0., ..., 0., 0., 0.]]))
    """
    X1, X2, y = list(), list(), list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            # encode the caption into a sequence of integer word ids
            seq = tokenizer.texts_to_sequences([desc])[0]
            for i in range(1, len(seq)):
                # split into an input prefix and the next word to predict
                in_seq, out_seq = seq[:i], seq[i]
                # pad in_seq so that its length is max_length
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                # one-hot encode the output word over the vocabulary
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                # each stored feature carries a leading axis of length 1; [0] drops it
                X1.append(photos_features[key][0])
                X2.append(in_seq)
                y.append(out_seq)
    return array(X1), array(X2), array(y)
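
# End-to-end wiring sketch for this module; the file names are assumptions
# about the local layout (compare the __main__ block below):
#
#   train = load_image_names('Flickr_8k.trainImages.txt')
#   descriptions = load_clean_captions('descriptions.txt', train)
#   features = load_photo_features('features.pkl', train)
#   tokenizer = load(open('tokenizer.pkl', 'rb'))      # as in the docstring example
#   vocab_size = len(tokenizer.word_index) + 1
#   max_length = get_max_length(descriptions)
#   X1, X2, y = create_sequences(tokenizer, max_length, descriptions, features, vocab_size)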


if __name__ == '__main__':
    path = 'D:/learningDoc/CVSource/CVSource/cv_action_codes/Homework/homework2/'
    train_image_names = load_image_names(path + 'task4/Flickr_8k.trainImages.txt')
    print("train_image_names=%d" % len(train_image_names))
    descriptions = load_clean_captions(path + 'task5/descriptions.txt', train_image_names)
    print(descriptions)