自己动手写CNN Inference框架之 (三) dense

之前我们介绍过CNN inference框架的基本结构，如何从tensorflow的graph中提取conv2d的权重，随后利用该权重进行对应的卷积操作。本文我们继续给大家介绍CNN中的另一个重要的Op --- 全连接层（Dense layers）。
首发：https://zhuanlan.zhihu.com/p/72897272
作者：张新栋

细心的读者可能会发现，其实conv2d也是一个特殊的dense操作。假设我们不考虑性能上的问题，一个conv2d操作可以分解成重排数据+矩阵相乘的两个步骤进行实现。所以caffe代码中在进行convolution的操作的时候，会先进行im2col进行数据的重排，最后用gemm进行矩阵的相乘，随后再重排数据以得到输出的结果。不同实现方式间各有利弊，后面有机会我会跟大家一起进行讨论。

几乎CNN中的所有Op都可以用gemm即矩阵相乘的形式进行解释，可见矩阵相乘的重要性。本文中要讲的dense Op实质上就是矩阵相乘，即y = Ax + b。我们在训练的时候，注意力会放在学习参数A和bias b上面。在进行inference的时候，对于上层的输入x，我们直接进行矩阵相乘（bias为可选），就可以得到dense的输出结果。另外需要提醒大家的是，tensorflow中实现采用的数据格式默认为NHWC，本文实现考虑NCHW。下面我们看看如何用C语言进行dense的操作，

Dense操作的实现

我们先来看如下的代码，对应矩阵操作y = Ax + b（本实现暂不包含bias b）。input即为上式中的x，filter为矩阵A，y为最后的输出结果output。为了设计上的简单方便，我们这里考虑A为一个二维矩阵，即对应NCHW的格式，filter应为11HW的数据格式；另外输入input也要求为一个一维的向量，对应NCHW的格式，input为11H1的格式。根据矩阵乘法的定义，output[i, 0]为filter的第i行向量与input列向量的点乘结果。下面代码中我们一次取出4行的filter，目的是为了减少for循环的次数，并对嵌套的循环进行unroll。后面的专栏文章，我们还会跟大家介绍如何使用halide进行如上操作。

// Dense (fully connected) layer without bias: output += filter * input.
//
// input  : 1 x 1 x W x 1 tensor, the vector x.
// output : 1 x 1 x H x 1 tensor, the vector y. Results are ACCUMULATED into
//          it, so the caller must zero it beforehand (or preload it with the
//          bias term).
// filter : 1 x 1 x H x W tensor, the matrix A, read as H contiguous rows of
//          W floats each.
//
// All shape requirements are enforced with assert().
void Dense(Tensor* input, Tensor* output, Tensor* filter)
{
    assert(input  != NULL);
    assert(getBatch(input) == 1);
    assert(getChannel(input) == 1);
    assert(getWidth(input) == 1);

    assert(output != NULL);
    assert(getBatch(output) == 1);
    assert(getChannel(output) == 1);
    assert(getWidth(output) == 1);

    assert(filter != NULL);
    assert(getBatch(filter) == 1);
    assert(getChannel(filter) == 1);
    assert(getHeight(filter) == getHeight(output));
    assert(getWidth(filter) == getHeight(input));

    const int H = getHeight(filter);
    const int W = getWidth(filter);

    const float* x = input->data;
    const float* A = filter->data;
    float*       y = output->data;

    // Integer division already truncates, so no floor() is needed.
    const int h_blocks = H / 4;
    int blk, row, w;

    // Unrolled part: handle four filter rows per pass over the input vector.
    for (blk = 0; blk < h_blocks; ++blk)
    {
        // First row of this block. The output index must advance by 4 per
        // block, exactly like the filter row index does.
        const int r = blk * 4;

        const float* f0 = A + (r + 0) * W;
        const float* f1 = A + (r + 1) * W;
        const float* f2 = A + (r + 2) * W;
        const float* f3 = A + (r + 3) * W;

        // Start from the current output values to keep accumulate semantics.
        float acc0 = y[r + 0];
        float acc1 = y[r + 1];
        float acc2 = y[r + 2];
        float acc3 = y[r + 3];

        for (w = 0; w < W; ++w)
        {
            const float xv = x[w];
            acc0 += f0[w] * xv;
            acc1 += f1[w] * xv;
            acc2 += f2[w] * xv;
            acc3 += f3[w] * xv;
        }

        y[r + 0] = acc0;
        y[r + 1] = acc1;
        y[r + 2] = acc2;
        y[r + 3] = acc3;
    }

    // Remainder: the last H % 4 rows that did not fill a whole block.
    for (row = h_blocks * 4; row < H; ++row)
    {
        const float* f = A + row * W;
        float acc = y[row];

        for (w = 0; w < W; ++w)
            acc += f[w] * x[w];

        y[row] = acc;
    }
}

解析tensorflow的dense参数

同上一篇文章一样，我们完成了dense的开发后，第二步希望能提取tensorflow中对应op的参数，对齐数据格式后进行数值验证。需要再强调的是，我们采用的数据排布格式为NCHW，这一点与tensorflow的默认输入格式不同。下面是提取dense参数的python脚本，

#coding=utf-8
import tensorflow as tf
import numpy as np


# Test input: the values 0..24 laid out as a (1, 1, 1, 25) tensor.
a = np.arange(25)
a = np.reshape(a, (1, 1, 1, 25))

## build graph
# The placeholder keeps the graph name "input"; only the Python variable is
# renamed so that it no longer shadows the builtin input().
input_ph = tf.placeholder(tf.float32, shape=(1, 1, 1, 25), name="input")
logits = tf.layers.dense(inputs=input_ph, units=3, activation=None, use_bias=False)

## parse parameters
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    graph_val = tf.get_default_graph()

    # Reference result, with the last axis moved to position 1 so that it can
    # be compared against the output printed by the C implementation.
    result = sess.run(logits, feed_dict={input_ph: a})
    result = np.transpose(result, (0, 3, 1, 2))
    print(result)

    # Transpose the kernel so that its shape is written as (H, W) in the
    # order the DENSE config expects (output count first, input count second).
    weights = graph_val.get_tensor_by_name('dense/kernel:0').eval()
    weights = np.transpose(weights, (1, 0))
    print(weights.shape)

    shapes = list(weights.shape)
    weights = weights.flatten()

    # Config file: op type, then filter height, then filter width.
    # NOTE: the original script also looped over the graph looking for a
    # "conv2d/Conv2D" node to append its data_format; this graph contains no
    # such node, so that loop never wrote anything and has been dropped.
    with open("./models/2_config.bat", "w") as config_file:
        config_file.write("DENSE\n")
        config_file.write(str(shapes[0]) + "\n")
        config_file.write(str(shapes[1]) + "\n")

    # Values file: one weight per line, with no trailing newline after the
    # last value.
    with open("./models/2_values.bat", "w") as values_file:
        values_file.write("\n".join(str(w) for w in weights))

基于C的模型解析和测试

保存好的参数如下所示，values因为参数值比较多，我们这里省略（values的保存一行为一个值）；我们看看config的参数，

DENSE
3
25

第一个参数为Op的类型，第二个参数为filter的Height（output的输出元素个数），第三个为filter的Width（input的输入元素个数）。当我们判断输入的Op类型为dense时，我们按照如下约定进行模型解析：1. 读取第二行到第三行的两个参数，分别为H和W，即矩阵的高和宽；2. 基于初始化好的tensor形状，读取values文件中的参数，注意数据排布为NCHW。可参考如下代码：

// Read one line of the config file into buf (at most size - 1 characters).
// On EOF or read error buf is set to the empty string.
static void readConfigLine(FILE* file, char* buf, int size)
{
    if (fgets(buf, size, file) == NULL)
        buf[0] = '\0';
}

// Read one line of the config file and parse it with atoi().
// Yields 0 when the line is missing.
static int readConfigInt(FILE* file)
{
    char buf[20];
    readConfigLine(file, buf, (int) sizeof(buf));
    return atoi(buf);
}

// Parse an op description from the text file `filename` into `opConfig`.
//
// The first line names the op type; the remaining lines depend on the type:
//   CONV2D  : N, C, H, W, padding, stride_h, stride_w, data format
//   SIGMOID : no further lines
//   DENSE   : H, W   (stored as dims = {1, 1, H, W})
//
// If the file cannot be opened, an error is printed to stderr and opConfig is
// left untouched. If the type line matches none of the known ops, opConfig is
// left untouched as well.
void extractOpConfig(OpConfig* opConfig, const char* filename)
{
    assert(opConfig != NULL);
    assert(filename != NULL);

    FILE* file = fopen(filename, "r");
    if (file == NULL) {
        fprintf(stderr, "extractOpConfig: cannot open %s\n", filename);
        return;
    }

    char type[20];
    readConfigLine(file, type, (int) sizeof(type));

    if (strstr(type, "CONV2D") != NULL) {

        int N = readConfigInt(file);
        int C = readConfigInt(file);
        int H = readConfigInt(file);
        int W = readConfigInt(file);

        // The padding line must be consumed to reach the strides, but its
        // value is not used: padding is always stored as 0, whether the line
        // says SAME or anything else.
        char padding[20];
        readConfigLine(file, padding, (int) sizeof(padding));

        int stride_h = readConfigInt(file);
        int stride_w = readConfigInt(file);

        // The data format line is read but not stored anywhere.
        char format[30];
        readConfigLine(file, format, (int) sizeof(format));

        opConfig->type     = CONV2D;
        opConfig->dims[0]  = N;
        opConfig->dims[1]  = C;
        opConfig->dims[2]  = H;
        opConfig->dims[3]  = W;
        opConfig->pad_w    = 0;
        opConfig->pad_h    = 0;
        opConfig->stride_w = stride_w;
        opConfig->stride_h = stride_h;

    } else if (strstr(type, "SIGMOID") != NULL) {

        opConfig->type = SIGMOID;

    } else if (strstr(type, "DENSE") != NULL) {

        int H = readConfigInt(file);
        int W = readConfigInt(file);

        // A dense filter is a plain H x W matrix, stored as 1 x 1 x H x W.
        opConfig->type     = DENSE;
        opConfig->dims[0]  = 1;
        opConfig->dims[1]  = 1;
        opConfig->dims[2]  = H;
        opConfig->dims[3]  = W;
    }

    fclose(file);
}

最后我们逐行的读取dense的权值，

// Fill tensor->data with the weights stored in the text file `filename`,
// one value per line, in file order.
//
// At most tensor->nums values are stored; any further lines in the file are
// ignored so that an oversized file cannot overflow the tensor buffer. If the
// file has fewer lines, the remaining elements keep their previous values.
// If the file cannot be opened, an error is printed to stderr and the tensor
// is left untouched.
void extractOpWeights(Tensor* tensor, const char* filename)
{
    assert(tensor != NULL);
    assert(filename != NULL);

    FILE* file = fopen(filename, "r");
    if (file == NULL) {
        fprintf(stderr, "extractOpWeights: cannot open %s\n", filename);
        return;
    }

    char line[50];
    float* data = tensor->data;
    int loaded = 0;

    while (loaded < tensor->nums && fgets(line, sizeof(line), file) != NULL)
    {
        data[loaded] = (float) atof(line);
        ++loaded;
    }

    fclose(file);
}

下面我们需要写业务代码进行模型测试，逻辑很简单，首先解析模型的config文件，然后根据config的参数进行values的初始化、dense的操作，最后跟python的输出结果进行精确性的对比。

// End-to-end check of the dense op: load the config and weights exported by
// the python script, feed the input 0, 1, 2, ..., run Dense() and print the
// result so it can be compared with the python output.
void test_dense()
{
    // dense operation
    Tensor* input  = NULL;
    Tensor* kernel = NULL;
    Tensor* output = NULL;

    // calloc so that the config is zeroed rather than holding garbage if
    // parsing leaves some of its fields unwritten.
    OpConfig* config = (OpConfig*) calloc(1, sizeof(OpConfig));
    assert(config != NULL);

    extractOpConfig(config, "./models/2_config.bat");

    // init kernel: dims come straight from the config, i.e. {1, 1, H, W}
    // NOTE(review): the Tensor pointers are passed to initTensor by value
    // while still NULL; this relies on initTensor being able to set them
    // (e.g. a macro or a reference parameter) -- confirm against its
    // definition.
    int kernel_h = config->dims[2];
    int kernel_w = config->dims[3];

    initTensor(kernel, config->dims, 0.0f);
    extractOpWeights(kernel, "./models/2_values.bat");

    // init input: a column vector with one element per filter column
    int input_dims[4] = {1, 1, kernel_w, 1};
    initTensor(input, input_dims, 0.0f);
    for (int i = 0; i < input->nums; ++i)
        input->data[i] = i;

    // init output: one element per filter row, zeroed because Dense()
    // accumulates into it
    int output_dims[4] = {1, 1, kernel_h, 1};
    initTensor(output, output_dims, 0.0f);

    // Dense: time only the computation, not the printing
    clock_t start = clock();
    Dense(input, output, kernel);
    clock_t end   = clock();
    float duration = (float)(end - start) / CLOCKS_PER_SEC;

    printTensor(output);
    printf("Dense duration: %f s\n", duration);

    // free Tensor
    freeTensor(input);
    freeTensor(kernel);
    freeTensor(output);
    free(config);
}

最后

本文我们对应tensorflow中的全连接（dense）操作，用C语言实现了简单的dense操作，然后用python脚本解析tensorflow的graph模型中的dense参数，并保存成自定义的文件格式；最后按照自定义的文件格式，读取和解析Op及Op对应参数，在自己的框架中实现dense的数值计算，结果跟tensorflow的数值计算结果一致。后续我们也会将本教程代码的github链接同步上来。

欢迎大家留言讨论、关注专栏，谢谢大家！

推荐阅读

专注嵌入式端的AI算法实现，欢迎关注作者微信公众号和知乎嵌入式AI算法实现专栏。

更多嵌入式AI相关的技术文章请关注极术嵌入式AI专栏

Dense操作的实现

解析tensorflow的dense参数

基于C的模型解析和测试

最后

推荐阅读

目录