C++元编程——CNN进行MNIST手写数字识别

腾昵猫

已于 2023-06-01 19:57:59 修改

阅读量1k

点赞数 1

CC 4.0 BY-SA版权

分类专栏：元编程学习实践文章标签： cnn 人工智能神经网络

于 2023-04-10 09:46:35 首次发布

本文链接：https://blog.csdn.net/Dr_Jack/article/details/130052706

元编程学习实践专栏收录该内容

23 篇文章

订阅专栏

MNIST数据来源：

MNIST handwritten digit database, Yann LeCun, Corinna Cortes and Chris Burges

数据的格式如下：

CNN结构，这个是理想结构，但是由于元编程速度太慢，所以就将全连接层改成很少的输出：

卷积层实现（convolution_layer）

这里的卷积层包含了卷积-加权-池化。正向传播时候的处理如下图所示：

反向传播包括两部分：参数更新和误差后传，卷积部分与加权部分的处理方式并不相同。加权层比较简单，直接用梯度下降更新即可。卷积层的误差后传使用转置卷积；卷积核的更新量则是用本层输入与上层传回的误差做卷积，并将结果归一化（除以最大绝对值）后得到。（具体细节后续有时间再补充）

代码如下：

#ifndef _CONVOLUTION_LAYER_HPP_
#define _CONVOLUTION_LAYER_HPP_
#include "mat.hpp"
#include "weight_initilizer.hpp"
#include "base_function.hpp"

/* 卷积层 */
template<int input_row, int input_col, int tpl_row, int tpl_col, int row_step, int col_step, template<typename> class update_method_templ, template<typename> class activate_func, typename tpl_init_method, typename pad_t, typename val_t = double>
struct conv_layer 
{
	using tpl_type = mat<tpl_row, tpl_col, val_t>;
	using input_type = mat<input_row, input_col, val_t>;
	using pad_type = mat<input_row + get_pad_size<pad_t>(input_row, tpl_row, row_step)
		, input_col + get_pad_size<pad_t>(input_col, tpl_col, col_step), val_t>;
	using pad_size = pad_size_t<input_row, input_col, tpl_row, tpl_col, row_step, col_step, pad_t>;
	using ret_type = decltype(inner_conv<row_step, col_step>(input_type().pad<pad_size::top, pad_size::left, pad_size::right, pad_size::bottom>(), tpl_type()));

	tpl_type mt_tpl;
	update_method_templ<mat<tpl_row, tpl_col, val_t>>	um_tpl;
	pad_type mt_input;
	ret_type mt_bias;
	update_method_templ<ret_type>	um_bias;

	activate_func<ret_type>	act_func;

	conv_layer()
	{
		weight_initilizer<tpl_init_method>::cal(mt_tpl);
	}

	inline ret_type forward(const input_type& mt)
	{
		mt_input = mt.pad<pad_size::top, pad_size::left, pad_size::right, pad_size::bottom>();
		ret_type mt1 = inner_conv<row_step, col_step>(mt_input, mt_tpl);
		return act_func.forward( mt1 + mt_bias);
	}

	inline input_type backward(const ret_type& mt_delta) 
	{
		auto mt_delta_deact = act_func.backward() * mt_delta;
		auto mt_delta_span = mt_delta_deact.span<row_step - 1, col_step - 1>();			// 采用了步长运算，等于有一些没计算，所以反向传播时候的贡献是0
		using ret_pad_type = decltype(mt_delta_span);
		/* 计算反向传播误差 */
		/* 计算返回阵需要pad的大小 */
		constexpr int target_r = tpl_row + pad_type::r - 1;
		constexpr int target_c = tpl_col + pad_type::c - 1;
		constexpr int pad_top = (target_r - ret_pad_type::r) / 2;
		constexpr int pad_left = (target_c - ret_pad_type::c) / 2;
		constexpr int pad_right = (target_c - ret_pad_type::c) - pad_left;
		constexpr int pad_bottom = (target_r - ret_pad_type::r) - pad_top;
		auto mt_delta_span_pad = mt_delta_span.pad<pad_top, pad_left, pad_right, pad_bottom>();
		auto mt_tpl_rot = mt_tpl.rot180();
		auto mt_ret_pad = inner_conv<1, 1, target_r, target_c, tpl_row, tpl_col, val_t>(mt_delta_span_pad, mt_tpl_rot);
		input_type mt_ret;
		mt_ret.assign<-1 * pad_size::top, -1 * pad_size::left>(mt_ret_pad);			// 剪除外边
		/* 计算卷积核更新 */
		auto mt_update = inner_conv<1, 1, pad_type::r, pad_type::c, ret_pad_type::r, ret_pad_type::c, val_t>(mt_input, mt_delta_span);
		if (mt_update.max_abs() != 0)
			mt_update = mt_update / mt_update.max_abs();
		mt_tpl = um_tpl.update(mt_tpl, mt_update);
		mt_bias = mt_bias - um_bias.update(mt_delta_deact, mt_delta_deact);
		/* 将模板均值置0，最大波动范围为1 */
		double d_mean = mt_tpl.sum() / (tpl_row * tpl_col);
		mt_tpl = mt_tpl - d_mean;
		if (mt_tpl.max_abs() != 0)
			mt_tpl = mt_tpl / mt_tpl.max_abs();

		return mt_ret;
	}

	void update_inert() 
	{
		um_tpl.update_inert();
		um_bias.update_inert();
	}

	void print() 
	{
		printf("<template>\r\n");
		mt_tpl.print();
		printf("<bias>\r\n");
		mt_bias.print();
	}

	static void print_type() 
	{
		printf("conv_layer<%d, %d, %d, %d, %d, %d> ", input_row, input_col, tpl_row, tpl_col, row_step, col_step);
		input_type::print_type();
	}
};

#include "ht_memory.h"
/* Serialises one conv_layer into `mry`: kernel first, then bias (read_file reads them back in that order). */
template<
	int input_row, int input_col
	, int tpl_row, int tpl_col
	, int row_step, int col_step
	, template<typename> class update_method_templ
	, template<typename> class activate_func
	, typename tpl_init_method
	, typename pad_t
	, typename val_t
>
void write_file(
	const conv_layer<
		input_row, input_col
		, tpl_row, tpl_col
		, row_step, col_step
		, update_method_templ
		, activate_func
		, tpl_init_method
		, pad_t
		, val_t
	>& lyr
	, ht_memory& mry)
{
	write_file(lyr.mt_tpl, mry);		// kernel
	write_file(lyr.mt_bias, mry);		// bias
}
/* Restores one conv_layer from `mry`: kernel first, then bias (the order write_file stored them in). */
template<
	int input_row, int input_col
	, int tpl_row, int tpl_col
	, int row_step, int col_step
	, template<typename> class update_method_templ
	, template<typename> class activate_func
	, typename tpl_init_method
	, typename pad_t
	, typename val_t
>
void read_file(
	ht_memory& mry
	, conv_layer<
		input_row, input_col
		, tpl_row, tpl_col
		, row_step, col_step
		, update_method_templ
		, activate_func
		, tpl_init_method
		, pad_t
		, val_t
	>& lyr)
{
	read_file(mry, lyr.mt_tpl);			// kernel
	read_file(mry, lyr.mt_bias);		// bias
}

/*
	Multi-channel, multi-kernel convolution layer.
	The input is laid out like this (one matrix per channel):
	|	C1		C2		C3|
	The output is laid out like this:
					kern1	kern2	kern3
	channels1	|	C1K1	C1K2	C1K3|
	channels2	|	C2K1	C2K2	C2K3|
	channels3	|	C3K1	C3K2	C3K3|
	where C is a channel, K is a kernel, and every CK entry is one convolved matrix.

	NOTE(review): the tpl_num kernels are shared by all channels, and each conv_layer
	caches its padded input in forward(). After forward() a kernel therefore only holds
	the state of the LAST channel it processed, yet backward() is invoked once per
	channel. Confirm this is intended when channel_num > 1.
*/
template<
	int channel_num, int tpl_num
	, int input_row, int input_col
	, int tpl_row, int tpl_col
	, int row_step, int col_step
	, template<typename> class update_method_templ
	, template<typename> class activate_func
	, typename tpl_init_method
	, typename pad_t
>
struct mul_channel_conv
{
	/* One single-kernel layer; the element type is fixed to double. */
	using conv_type = conv_layer<
		input_row, input_col, tpl_row, tpl_col, row_step, col_step
		, update_method_templ, activate_func, tpl_init_method, pad_t
		, double
	>;
	using input_type = mat<channel_num, 1, typename conv_type::input_type>;
	using ret_type = mat<channel_num, tpl_num, typename conv_type::ret_type>;

	conv_type tpls[tpl_num];		// one layer per kernel

	/* Applies every kernel to every channel; entry (channel, kernel) holds that convolution result. */
	ret_type forward(const input_type& mt)
	{
		ret_type out;
		for (int ch = 0; ch < channel_num; ++ch)
		{
			for (int k = 0; k < tpl_num; ++k)
			{
				out.get(ch, k) = tpls[k].forward(mt[ch]);
			}
		}
		return out;
	}

	/*
		The output has more dimensions than the input, so for each channel the
		errors returned by the individual kernels are accumulated, each scaled
		by 1/tpl_num.
	*/
	input_type backward(const ret_type& delta)
	{
		input_type grad;
		for (int ch = 0; ch < channel_num; ++ch)
		{
			for (int k = 0; k < tpl_num; ++k)
			{
				grad.get(ch, 0) = grad.get(ch, 0) + tpls[k].backward(delta.get(ch, k)) / static_cast<double>(tpl_num);
			}
		}
		return grad;
	}

	/* Forwards update_inert() to every kernel layer. */
	void update_inert()
	{
		for (int k = 0; k < tpl_num; ++k)
		{
			tpls[k].update_inert();
		}
	}
};

/* Serialises every kernel layer of a mul_channel_conv into `mry`, in index order. */
template<
	int channel_num, int tpl_num,
	int input_row, int input_col,
	int tpl_row, int tpl_col,
	int row_step, int col_step,
	template<typename> class update_method_templ,
	template<typename> class activate_func,
	typename tpl_init_method,
	typename pad_t
>
void write_file(const mul_channel_conv<
	channel_num, tpl_num,
	input_row, input_col,
	tpl_row, tpl_col,
	row_step, col_step,
	update_method_templ,
	activate_func,
	tpl_init_method,
	pad_t
	>& mcc, ht_memory& mry)
{
	for (const auto& kernel_layer : mcc.tpls)
	{
		write_file(kernel_layer, mry);
	}
}

/* Restores every kernel layer of a mul_channel_conv from `mry`, in index order. */
template<
	int channel_num, int tpl_num,
	int input_row, int input_col,
	int tpl_row, int tpl_col,
	int row_step, int col_step,
	template<typename> class update_method_templ,
	template<typename> class activate_func,
	typename tpl_init_method,
	typename pad_t
>
void read_file(ht_memory& mry, mul_channel_conv<
	channel_num, tpl_num,
	input_row, input_col,
	tpl_row, tpl_col,
	row_step, col_step,
	update_method_templ,
	activate_func,
	tpl_init_method,
	pad_t
>& mcc)
{
	for (auto& kernel_layer : mcc.tpls)
	{
		read_file(mry, kernel_layer);
	}
}

/* 多通道、多核的卷积层，最后将卷积结果进行加权，得到指定个数的结果 */
#include "bp.hpp"

/*
	Multi-channel, multi-kernel convolution followed by a weighting layer.
	The input is laid out like this:
	|	C1		C2		C3|
	The convolution stage outputs:
					kern1	kern2	kern3
	channels1	|	C1K1	C1K2	C1K3|
	channels2	|	C2K1	C2K2	C2K3|
	channels3	|	C3K1	C3K2	C3K3|
	where C is a channel, K is a kernel, and every CK entry is one convolved matrix.
	The weight matrix looks like this:
	|	W1, W2, W3|
	and the weighting stage outputs:
	|sum<arg.n={1-3}>(Wn*CnK1)|
	|sum<arg.n={1-3}>(Wn*CnK2)|
	|sum<arg.n={1-3}>(Wn*CnK3)|
	i.e. each row is the weighted sum, over all channels, of one kernel's results.
*/
template<
	int channel_num, int tpl_num
	, int input_row, int input_col
	, int tpl_row, int tpl_col
	, int row_step, int col_step
	, template<typename> class update_method_templ
	, template<typename> class activate_func
	, typename tpl_init_method
	, typename pad_t
>
struct mul_channel_conv_with_weight
{
	using mul_conv_type = mul_channel_conv<
		channel_num, tpl_num, input_row, input_col
		, tpl_row, tpl_col, row_step, col_step
		, update_method_templ, activate_func, tpl_init_method, pad_t
	>;
	using mul_conv_ret_type = typename mul_conv_type::ret_type;
	/* The weighting stage always uses nadam / ReLu / HeGaussian, independent of this struct's template arguments. */
	using weight_type = bp<
		typename mul_conv_ret_type::type, mul_conv_ret_type::c
		, nadam, ReLu, HeGaussian
		, mul_conv_ret_type::r, 1
	>;
	using ret_type = typename weight_type::ret_type::t_type;
	using input_type = typename mul_conv_type::input_type;

	mul_conv_type mconv;		// convolution stage
	weight_type weight;			// per-channel weighting stage

	/* Convolution, then weighting; the weighted result is transposed before it is returned. */
	ret_type forward(const input_type& mt)
	{
		auto conv_out = mconv.forward(mt);
		auto weighted = weight.forward(conv_out);
		return weighted.t();
	}

	/* Reverse of forward(): transpose the delta, pass it through the weighting stage, then the convolution stage. */
	input_type backward(const ret_type& delta)
	{
		auto weight_delta = weight.backward(delta.t());
		return mconv.backward(weight_delta);
	}

	/* Forwards update_inert() to both stages. */
	void update_inert()
	{
		mconv.update_inert();
		weight.update_inert();
	}
};

/* Serialises a mul_channel_conv_with_weight into `mry`: convolution stage first, weighting stage second. */
template<
	int channel_num, int tpl_num,
	int input_row, int input_col,
	int tpl_row, int tpl_col,
	int row_step, int col_step,
	template<typename> class update_method_templ,
	template<typename> class activate_func,
	typename tpl_init_method,
	typename pad_t
>
void write_file(const mul_channel_conv_with_weight<
	channel_num, tpl_num,
	input_row, input_col,
	tpl_row, tpl_col,
	row_step, col_step,
	update_method_templ,
	activate_func,
	tpl_init_method,
	pad_t
>& mccw, ht_memory& mry)
{
	write_file(mccw.mconv, mry);		// convolution kernels and biases
	write_file(mccw.weight, mry);		// weighting layer
}

/* Restores a mul_channel_conv_with_weight from `mry`: convolution stage first, weighting stage second. */
template<
	int channel_num, int tpl_num,
	int input_row, int input_col,
	int tpl_row, int tpl_col,
	int row_step, int col_step,
	template<typename> class update_method_templ,
	template<typename> class activate_func,
	typename tpl_init_method,
	typename pad_t
>
void read_file(ht_memory& mry, mul_channel_conv_with_weight<
	channel_num, tpl_num,
	input_row, input_col,
	tpl_row, tpl_col,
	row_step, col_step,
	update_method_templ,
	activate_func,
	tpl_init_method,
	pad_t
>& mccw)
{
	read_file(mry, mccw.mconv);			// convolution kernels and biases
	read_file(mry, mccw.weight);		// weighting layer
}

#endif

堆叠卷积层和池化层，最后增加全连接层和判别层，形成CNN（cnn）：

#ifndef _CNN_HPP_
#define _CNN_HPP_
#include "convolution_layer.hpp"
#include "pool_layer.hpp"
#include "bp.hpp"

/*
	Multi-kernel, multi-channel weighted convolution followed by pooling.

	Every one of the tpl_num feature maps gets its OWN pooling layer. backward()
	hands a pooling layer nothing but the delta, so whatever it needs from the
	matching forward() call has to be kept inside the layer object; a single
	shared object would only hold the state of the last feature map pooled.
	(Assumes pool_layer is stateful in that way; if it is stateless the extra
	instances are merely redundant.)
*/
template<
	int channel_num, int tpl_num
	, int input_row, int input_col
	, int tpl_row, int tpl_col
	, int row_step, int col_step
	, int pool_row, int pool_col
	, template<typename> class update_method_templ
	, template<typename> class activate_func
	, typename tpl_init_method
	, typename pad_t
>
struct mccw_with_pool
{
	using mccw_type = mul_channel_conv_with_weight<
		channel_num, tpl_num
		, input_row, input_col
		, tpl_row, tpl_col
		, row_step, col_step
		, update_method_templ
		, activate_func
		, tpl_init_method
		, pad_t
	>;
	/* One feature map produced by the convolution + weighting stage. */
	using mccw_ret_type = typename mccw_type::ret_type::type;
	using pool_type = pool_layer<pool_layer_max, mccw_ret_type::r, mccw_ret_type::c, pool_row, pool_col, typename mccw_ret_type::type>;
	using ret_type = mat<tpl_num, 1, typename pool_type::ret_type>;
	using input_type = typename mccw_type::input_type;

	mccw_type mccw_layer;
	pool_type pool[tpl_num];		// pool[i] serves feature map i in both directions

	/* Convolution + weighting, then each feature map is pooled by its own layer. */
	ret_type forward(const input_type& mt)
	{
		ret_type ret;
		auto mccw_out = mccw_layer.forward(mt);
		for (int i = 0; i < tpl_num; ++i)
		{
			ret[i] = pool[i].forward(mccw_out[i]);
		}
		return ret;
	}

	/* Un-pools every delta with the layer that pooled that feature map, then back-propagates through the convolution. */
	input_type backward(const ret_type& delta)
	{
		using unpooled_type = typename mccw_type::ret_type;
		unpooled_type unpooled;
		for (int i = 0; i < tpl_num; ++i)
		{
			unpooled[i] = pool[i].backward(delta[i]);
		}
		return mccw_layer.backward(unpooled);
	}

	/* Only the convolution + weighting stage receives update_inert(); the pooling layers are not called. */
	void update_inert()
	{
		mccw_layer.update_inert();
	}
};

/* Serialises a mccw_with_pool into `mry`; only the convolution + weighting stage is written. */
template<
	int channel_num, int tpl_num,
	int input_row, int input_col,
	int tpl_row, int tpl_col,
	int row_step, int col_step,
	int pool_row, int pool_col,
	template<typename> class update_method_templ,
	template<typename> class activate_func,
	typename tpl_init_method,
	typename pad_t
>
void write_file(const mccw_with_pool<
	channel_num, tpl_num,
	input_row, input_col,
	tpl_row, tpl_col,
	row_step, col_step,
	pool_row, pool_col,
	update_method_templ,
	activate_func,
	tpl_init_method,
	pad_t
>& mccwp, ht_memory& mry)
{
	// The pooling member is not serialised.
	write_file(mccwp.mccw_layer, mry);
}

/* Restores a mccw_with_pool from `mry`; only the convolution + weighting stage is read. */
template<
	int channel_num, int tpl_num,
	int input_row, int input_col,
	int tpl_row, int tpl_col,
	int row_step, int col_step,
	int pool_row, int pool_col,
	template<typename> class update_method_templ,
	template<typename> class activate_func,
	typename tpl_init_method,
	typename pad_t
>
void read_file(ht_memory& mry, mccw_with_pool<
	channel_num, tpl_num,
	input_row, input_col,
	tpl_row, tpl_col,
	row_step, col_step,
	pool_row, pool_col,
	update_method_templ,
	activate_func,
	tpl_init_method,
	pad_t
>& mccwp)
{
	// The pooling member is not part of the stored data.
	read_file(mry, mccwp.mccw_layer);
}

/*
	Stacks several "multi-channel convolution - weighting - pooling" layers.
	Each layer yields one matrix per kernel; stretch() flattens the final set
	of matrices into a single matrix for the fully connected layers, and
	split() is its inverse.
	The trailing int parameters describe the layers, SEVEN ints per layer:
	tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col.
*/
template<int channel_num, int input_row, int input_col, typename pad_t>			// parameters derived from the previous layer live here
struct mccwp_cluster_input
{
	/*
		Every layer's output looks like this
						kern1	kern2	kern3
		channels1	|	C1K1	C1K2	C1K3|
		channels2	|	C2K1	C2K2	C2K3|
		channels3	|	C3K1	C3K2	C3K3|
	*/
	/* Primary template: the current layer followed by at least one more layer described by `rest`. */
	template<
		template<typename> class update_method_templ
		, template<typename> class activate_func
		, typename tpl_init_method
		, int tpl_num
		, int tpl_row, int tpl_col
		, int row_step, int col_step
		, int pool_row, int pool_col
		, int ...rest
	>
	struct mccwp_cluster
	{
		static_assert(sizeof...(rest) % 7 == 0
			, "every layer needs 7 ints: tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col");

		using cur_type = mccw_with_pool<
			channel_num, tpl_num
			, input_row, input_col
			, tpl_row, tpl_col
			, row_step, col_step
			, pool_row, pool_col
			, update_method_templ
			, activate_func
			, tpl_init_method
			, pad_t
		>;
		using next_input_type = typename cur_type::ret_type::type;

		/* The next layer's channel count is this layer's kernel count. */
		using next_type = typename mccwp_cluster_input<tpl_num, next_input_type::r, next_input_type::c, pad_t>::template mccwp_cluster<
			update_method_templ
			, activate_func
			, tpl_init_method
			, rest...
		>;			// the next layer's input type is mat<tpl_num, 1, mat<input_row', input_col', double> >

		using input_type = typename cur_type::input_type;
		using ret_type = typename next_type::ret_type;

		cur_type cur_mccwp;
		next_type next;

		/* Runs this layer, then the remaining layers. */
		ret_type forward(const input_type& mt_input)
		{
			return next.forward(cur_mccwp.forward(mt_input));
		}

		/* Back-propagates through the remaining layers first, then through this one. */
		input_type backward(const ret_type& delta)
		{
			return cur_mccwp.backward(next.backward(delta));
		}

		void update_inert()
		{
			cur_mccwp.update_inert();
			next.update_inert();
		}

		using stretch_type = unite_mat_t<ret_type>;

		/* Copies the buffers of all element matrices of `ret`, back to back, into one stretch_type. */
		static stretch_type stretch(const ret_type& ret)
		{
			stretch_type mtd;
			auto p = mtd.pval->p;
			for (int i = 0; i < ret.r; ++i)
			{
				memcpy(p, ret[i].pval->p, sizeof(*p) * ret[i].r *  ret[i].c);
				p += (ret[i].r *  ret[i].c);
			}
			return mtd;
		}

		/* Inverse of stretch(): cuts the flat buffer back into the element matrices of a ret_type. */
		static ret_type split(const stretch_type& st)
		{
			ret_type ret;
			auto p = st.pval->p;
			for (int i = 0; i < ret.r; ++i)
			{
				memcpy(ret[i].pval->p, p, sizeof(*p) * ret[i].r *  ret[i].c);
				p += (ret[i].r *  ret[i].c);
			}
			return ret;
		}
	};

	/* Terminal specialisation: the last layer of the stack (no `rest`). */
	template<
		template<typename> class update_method_templ
		, template<typename> class activate_func
		, typename tpl_init_method
		, int tpl_num
		, int tpl_row, int tpl_col
		, int row_step, int col_step
		, int pool_row, int pool_col
	>
	struct mccwp_cluster<
	update_method_templ
	, activate_func
	, tpl_init_method
	, tpl_num
	, tpl_row, tpl_col
	, row_step, col_step
	, pool_row, pool_col
	>
	{
		using cur_type = mccw_with_pool<
			channel_num, tpl_num
			, input_row, input_col
			, tpl_row, tpl_col
			, row_step, col_step
			, pool_row, pool_col
			, update_method_templ
			, activate_func
			, tpl_init_method
			, pad_t
		>;

		using input_type = typename cur_type::input_type;
		using ret_type = typename cur_type::ret_type;

		cur_type cur_mccwp;

		ret_type forward(const input_type& mt_input)
		{
			return cur_mccwp.forward(mt_input);
		}

		input_type backward(const ret_type& delta)
		{
			return cur_mccwp.backward(delta);
		}

		void update_inert()
		{
			cur_mccwp.update_inert();
		}

		using stretch_type = unite_mat_t<ret_type>;

		/* Copies the buffers of all element matrices of `ret`, back to back, into one stretch_type. */
		static stretch_type stretch(const ret_type& ret)
		{
			stretch_type mtd;
			auto p = mtd.pval->p;
			for (int i = 0; i < ret.r; ++i)
			{
				memcpy(p, ret[i].pval->p, sizeof(*p) * ret[i].r *  ret[i].c);
				p += (ret[i].r *  ret[i].c);
			}
			return mtd;
		}

		/*
			Inverse of stretch(). cnn::backward calls split() on whichever cluster
			type it holds, so a stack consisting of this single layer needs it too.
		*/
		static ret_type split(const stretch_type& st)
		{
			ret_type ret;
			auto p = st.pval->p;
			for (int i = 0; i < ret.r; ++i)
			{
				memcpy(ret[i].pval->p, p, sizeof(*p) * ret[i].r *  ret[i].c);
				p += (ret[i].r *  ret[i].c);
			}
			return ret;
		}
	};
};

/*
	Serialises a stack of mccwp_cluster layers into `mry`, front layer first.
	The cluster type sits in a non-deduced context, so callers must spell out
	all template arguments explicitly.
*/
template<
	int channel_num, int input_row, int input_col, typename pad_t,
	template<typename> class update_method_templ,
	template<typename> class activate_func,
	typename tpl_init_method,
	int tpl_num,
	int tpl_row, int tpl_col,
	int row_step, int col_step,
	int pool_row, int pool_col,
	int ...rest
>
void write_file(const typename mccwp_cluster_input<channel_num, input_row, input_col, pad_t>::template mccwp_cluster<update_method_templ
	, activate_func
	, tpl_init_method
	, tpl_num
	, tpl_row, tpl_col
	, row_step, col_step
	, pool_row, pool_col
	, rest...>& mccwpc, ht_memory& mry)
{
	write_file(mccwpc.cur_mccwp, mry);
	if constexpr (sizeof...(rest) != 0)
	{
		// More layers follow: the next layer's geometry is taken from the cluster's own alias.
		using cluster_type = typename mccwp_cluster_input<channel_num, input_row, input_col, pad_t>::template mccwp_cluster<
			update_method_templ, activate_func, tpl_init_method
			, tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col
			, rest...
		>;
		using next_input_type = typename cluster_type::next_input_type;
		write_file<
			tpl_num, next_input_type::r, next_input_type::c, pad_t
			, update_method_templ, activate_func, tpl_init_method
			, rest...
		>(mccwpc.next, mry);
	}
}

/*
	Restores a stack of mccwp_cluster layers from `mry`, front layer first.
	The cluster type sits in a non-deduced context, so callers must spell out
	all template arguments explicitly.
*/
template<
	int channel_num, int input_row, int input_col, typename pad_t,
	template<typename> class update_method_templ,
	template<typename> class activate_func,
	typename tpl_init_method,
	int tpl_num,
	int tpl_row, int tpl_col,
	int row_step, int col_step,
	int pool_row, int pool_col,
	int ...rest
>
void read_file(ht_memory& mry, typename mccwp_cluster_input<channel_num, input_row, input_col, pad_t>::template mccwp_cluster<update_method_templ
	, activate_func
	, tpl_init_method
	, tpl_num
	, tpl_row, tpl_col
	, row_step, col_step
	, pool_row, pool_col
	, rest...>& mccwpc)
{
	read_file(mry, mccwpc.cur_mccwp);
	if constexpr (sizeof...(rest) != 0)
	{
		// More layers follow: the next layer's geometry is taken from the cluster's own alias.
		using cluster_type = typename mccwp_cluster_input<channel_num, input_row, input_col, pad_t>::template mccwp_cluster<
			update_method_templ, activate_func, tpl_init_method
			, tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col
			, rest...
		>;
		using next_input_type = typename cluster_type::next_input_type;
		read_file<
			tpl_num, next_input_type::r, next_input_type::c, pad_t
			, update_method_templ, activate_func, tpl_init_method
			, rest...
		>(mry, mccwpc.next);
	}
}

/*
	Complete network: stacked convolution/pooling layers, a fully connected
	hidden layer of 128 units (always ReLu), and a final judging layer with
	output_num outputs.
	`rest` carries the per-layer integers handed on to mccwp_cluster.
*/
template<
	template<typename> class judge_update_method_templ,
	template<typename> class judge_activate_func,
	typename judge_tpl_init_method,
	typename pad_t,
	int channel_num, int input_row, int input_col, int output_num,
	template<typename> class update_method_templ,
	template<typename> class activate_func,
	typename tpl_init_method,
	int...rest
>
struct cnn
{
	using mccwpc_type = typename mccwp_cluster_input<channel_num, input_row, input_col, pad_t>::template mccwp_cluster<
		update_method_templ, activate_func, tpl_init_method, rest...
	>;
	using input_type = typename mccwpc_type::input_type;

	/* Flattened output of the convolution stack feeds the dense layers. */
	using bp_input_type = typename mccwpc_type::stretch_type;
	using all_conn_type = bp<double, 1, judge_update_method_templ, ReLu, judge_tpl_init_method, bp_input_type::r, 128>;
	using bp_type = bp<double, 1, judge_update_method_templ, judge_activate_func, judge_tpl_init_method, 128, output_num>;

	using ret_type = typename bp_type::ret_type;

	mccwpc_type mccwpc_layer;		// convolution / pooling stack
	all_conn_type ac_layer;			// hidden fully connected layer
	bp_type bp_layer;				// judging (output) layer

	/* conv stack -> stretch -> hidden layer -> judging layer */
	ret_type forward(const input_type& mt)
	{
		return bp_layer.forward(
			ac_layer.forward(
				mccwpc_type::stretch(mccwpc_layer.forward(mt))));
	}

	/* judging layer -> hidden layer -> split -> conv stack */
	auto backward(const ret_type& delta)
	{
		return mccwpc_layer.backward(
			mccwpc_type::split(
				ac_layer.backward(bp_layer.backward(delta))));
	}

	/* Forwards update_inert() to all three parts. */
	void update_inert()
	{
		mccwpc_layer.update_inert();
		bp_layer.update_inert();
		ac_layer.update_inert();
	}
};

/* Serialises a whole cnn into `mry`: convolution stack, hidden layer, judging layer. */
template<
	template<typename> class judge_update_method_templ,
	template<typename> class judge_activate_func,
	typename judge_tpl_init_method,
	typename pad_t,
	int channel_num, int input_row, int input_col, int output_num,
	template<typename> class update_method_templ,
	template<typename> class activate_func,
	typename tpl_init_method,
	int...rest
>
void write_file(const cnn<
		judge_update_method_templ,
		judge_activate_func,
		judge_tpl_init_method,
		pad_t,
		channel_num, input_row, input_col, output_num,
		update_method_templ,
		activate_func,
		tpl_init_method,
		rest...
	>& c, ht_memory& mry)
{
	// The cluster overload cannot deduce its arguments, hence the explicit list.
	write_file<
		channel_num, input_row, input_col, pad_t,
		update_method_templ, activate_func, tpl_init_method,
		rest...
	>(c.mccwpc_layer, mry);
	write_file(c.ac_layer, mry);
	write_file(c.bp_layer, mry);
}

/* Restores a whole cnn from `mry`: convolution stack, hidden layer, judging layer. */
template<
	template<typename> class judge_update_method_templ,
	template<typename> class judge_activate_func,
	typename judge_tpl_init_method,
	typename pad_t,
	int channel_num, int input_row, int input_col, int output_num,
	template<typename> class update_method_templ,
	template<typename> class activate_func,
	typename tpl_init_method,
	int...rest
>
void read_file(ht_memory& mry, cnn<
	judge_update_method_templ,
	judge_activate_func,
	judge_tpl_init_method,
	pad_t,
	channel_num, input_row, input_col, output_num,
	update_method_templ,
	activate_func,
	tpl_init_method,
	rest...
>& c)
{
	// The cluster overload cannot deduce its arguments, hence the explicit list.
	read_file<
		channel_num, input_row, input_col, pad_t,
		update_method_templ, activate_func, tpl_init_method,
		rest...
	>(mry, c.mccwpc_layer);
	read_file(mry, c.ac_layer);
	read_file(mry, c.bp_layer);
}

#endif

CNN库导出代码（cnn_export）：


#include "cnn_export.h"


#include "mat.hpp"
#include "activate_function.hpp"
#include "weight_initilizer.hpp"
#include "update_methods.hpp"

#include "convolution_layer.hpp"
#include "bp.hpp"
#include "update_methods.hpp"

#include "cnn.hpp"

/* Concrete network used by this library: 1x28x28 input, two conv/pool layers, 10 outputs. */
using cnn_type = cnn<
	nadam, softmax, XavierGaussian		// judging layer: update method, activation, initialiser
	, same_pad
	, 1, 28, 28, 10						// 1 channel, 28*28 image, 10 outputs
	, nadam, ReLu, HeGaussian			// conv/pool layers: update method, activation, initialiser

	// layer 1
	, 16								// number of kernels
	, 5, 5								// kernel size
	, 1, 1								// kernel stride
	, 2, 2								// pooling window

	// layer 2
	, 32								// number of kernels
	, 5, 5								// kernel size
	, 1, 1								// kernel stride
	, 2, 2								// pooling window
>;

/* The single network instance every exported function operates on. */
cnn_type g_cnn;
/* Normalisation statistics; in the code shown here they appear only in commented-out lines. */
mat<28, 28, double> g_mean1;
mat<28, 28, double> g_div1;

/* Exported wrapper: forwards update_inert() to the global network g_cnn. */
void CNN_EXPORT update_inert()
{
	g_cnn.update_inert();
}

/*
	Runs one training step on a single sample: forward pass, error against the
	label, backward pass.
	Returns the error (network output minus st_label).
*/
mat<10, 1, double>CNN_EXPORT train_cnn(const mat<28, 28, double>& st_data, const mat<10, 1, double>& st_label)
{
	// The network takes one matrix per channel; there is a single channel here.
	cnn_type::input_type net_input;
	net_input[0] = st_data;

	cnn_type::ret_type net_output = g_cnn.forward(net_input);
	auto err = net_output - st_label;
	g_cnn.backward(err);
	return err;
}

/*
	Trains the global network on a whole data set.

	vec1                input images
	vec_expected        expected outputs, paired with vec1 by index
	train_num           number of passes over the data set
	d_repeat_threshold  a sample is trained again, immediately, while the largest
	                    absolute error it produces exceeds this value

	Only the index range present in BOTH vectors is used, so a shorter image
	vector can no longer be read past its end.

	NOTE(review): the retry on one sample is unbounded; a sample that never
	gets below the threshold keeps this function from returning.
*/
void CNN_EXPORT train_cnn(const std::vector<mat<28, 28, double>>& vec1, const std::vector<mat<10, 1, double> >& vec_expected, const int& train_num, const double& d_repeat_threshold)
{
	const std::size_t sample_num = vec1.size() < vec_expected.size() ? vec1.size() : vec_expected.size();

	//auto vecn1 = normalize(vec1, g_mean1, g_div1);
	cnn_type::input_type in;
	for (int i = 0; i < train_num; ++i)
	{
		for (std::size_t j = 0; j < sample_num; )
		{
			in[0] = vec1[j];
			cnn_type::ret_type tmp = g_cnn.forward(in);
			auto delta = tmp - vec_expected[j];
			g_cnn.backward(delta);
			// Advance only once the error is within the threshold; otherwise repeat this sample.
			if (!(d_repeat_threshold < delta.max_abs())) ++j;
		}
	}
}

/*
	Runs a forward pass of the global network on one image and returns the
	network output. No normalisation is applied (the mean/div variant is
	disabled).
*/
mat<10, 1, double> CNN_EXPORT predict(const mat<28, 28, double>& channel1)
{
	//mat<28, 28, double> mtn1 = (channel1 - g_mean1) / g_div1;
	cnn_type::input_type net_input;
	net_input[0] = channel1;		// single input channel
	return g_cnn.forward(net_input);
}

/* Loads the file at cstr_path into a buffer and restores the global network from it. */
void CNN_EXPORT read_cnn(const char * cstr_path)
{
	ht_memory buffer(system_endian());
	buffer.read_file(cstr_path);
	read_file(buffer, g_cnn);
}

/* Serialises the global network into a buffer and writes that buffer to cstr_path. */
void CNN_EXPORT write_cnn(const char * cstr_path)
{
	ht_memory buffer(system_endian());
	write_file(g_cnn, buffer);
	buffer.write_file(cstr_path);
}

训练程序：

#include <conio.h>

#include "common_tools.h"
#include "mat.hpp"
#include "activate_function.hpp"
#include "weight_initilizer.hpp"
#include "update_methods.hpp"

#include "convolution_layer.hpp"
#include "bp.hpp"
#include "update_methods.hpp"

#include <iostream>
#include <string>
#include <random>
#include <algorithm>
#include "ht_memory.h"
#include "mat.hpp"
#include <opencv2/opencv.hpp>
#include <boost/timer.hpp>
#include <boost/filesystem.hpp>
#include <boost/progress.hpp>

#include "cnn_export.h"

/* Fills a 28x28 matrix from 28*28 bytes stored row by row in `sz`. */
void assign_mat(mat<28, 28, double>& mt, const unsigned char* sz)
{
	constexpr int side = 28;
	for (int idx = 0; idx < side * side; ++idx)
	{
		// Row-major layout: byte idx belongs to (idx / 28, idx % 28).
		mt.get(idx / side, idx % side) = sz[idx];
	}
}

/* One labelled sample. */
struct train_data
{
	mat<28, 28, double> mt_image;		// pixel matrix
	mat<10, 1, double> mt_label;		// label as a 10x1 matrix
	// Label as a plain digit. Initialised because not every loader in this file
	// assigns it, and copying an indeterminate int is undefined behaviour.
	// -1 cannot be mistaken for a real digit.
	int					i_num = -1;
};

/*
	Prints a duration given in seconds as "<h>h<m>min<s>sec" and ALWAYS ends the line.
	Units that are zero are left out, except that a duration with no hours and
	no minutes always prints its seconds, so 0 prints "0sec".
*/
void print_time(const double& sec)
{
	double d = sec;
	bool printed = false;			// becomes true once hours or minutes were written
	if (d >= 3600.)
	{
		const double h = floor(d / 3600.);
		std::cout << h << "h";
		d = d - h * 3600.;
		printed = true;
	}
	if (d >= 60)
	{
		const double m = floor(d / 60.);
		std::cout << m << "min";
		d = d - m * 60.;
		printed = true;
	}
	if (d > 0 || !printed)
	{
		std::cout << d << "sec";
	}
	// Terminate the line even when the seconds part was skipped (e.g. exactly one hour).
	std::cout << std::endl;
}


/* File the trained network is read from at start-up (if it exists) and written back to. */
const char* g_cstr_file_name = "./cnn_data.dat";

/* File-scope using-directives: the unqualified names in main() (path, exists, progress_display, Mat, imshow, waitKey) presumably resolve through these. */
using namespace boost;
using namespace boost::filesystem;
using namespace cv;

/*
	Converts an NxN matrix into an 8-bit single-channel OpenCV image.
	Values are clamped to [0, 255] before the conversion, because converting
	an out-of-range floating point value to uchar is undefined behaviour.
	In-range values are truncated as before; NaN maps to 0.
*/
template<int N, typename val_t>
Mat convert2cv(const mat<N, N, val_t>& mt)
{
	Mat mtret = Mat::zeros(N, N, CV_8UC1);
	for (int r = 0; r < N; ++r)
	{
		for (int c = 0; c < N; ++c)
		{
			const double v = static_cast<double>(mt.get(r, c));
			double clamped = 0.;
			if (v > 255.)
			{
				clamped = 255.;
			}
			else if (v > 0.)			// false for negatives and for NaN, which keep 0
			{
				clamped = v;
			}
			mtret.at<uchar>(r, c) = static_cast<uchar>(clamped);
		}
	}
	return mtret;
}

/*
	Finds the largest element of `mt`.
	max_r / max_c receive its position (the first one in row-major order if
	there are ties) and max_val its value.
	The search starts from element (0, 0) rather than from a fixed constant, so
	matrices whose entries are all tiny or negative are handled correctly, and
	so are integer element types.
*/
template<int row_num, int col_num, typename val_t>
void max_idx(int& max_r, int& max_c, val_t& max_val, const mat<row_num, col_num, val_t>& mt)
{
	max_r = 0;
	max_c = 0;
	max_val = mt.get(0, 0);
	for (int r = 0; r < row_num; ++r)
	{
		for (int c = 0; c < col_num; ++c)
		{
			if (max_val < mt.get(r, c))
			{
				max_val = mt.get(r, c);
				max_r = r;
				max_c = c;
			}
		}
	}
}

int main(int argc, char** argv)
{
	unsigned char sz_image_buf[28 * 28];

	std::vector<train_data> vec_train_data;

	ht_memory mry_train_images(ht_memory::big_endian);
	mry_train_images.read_file("./data/train-images.idx3-ubyte");
	int32_t i_image_magic_num = 0, i_image_num = 0, i_image_col_num = 0, i_image_row_num = 0;
	mry_train_images >> i_image_magic_num >> i_image_num >> i_image_row_num >> i_image_col_num;
	printf("magic num:%d | image num:%d | image_row:%d | image_col:%d\r\n"
		, i_image_magic_num, i_image_num, i_image_row_num, i_image_col_num);

	ht_memory mry_train_labels(ht_memory::big_endian);
	mry_train_labels.read_file("./data/train-labels.idx1-ubyte");
	int32_t i_label_magic_num = 0, i_label_num = 0;
	mry_train_labels >> i_label_magic_num >> i_label_num;

	for (int i = 0; i < i_image_num; ++i)
	{
		memset(sz_image_buf, 0, sizeof(sz_image_buf));
		train_data td;
		unsigned char uc_label = 0;
		mry_train_images.read((char*)sz_image_buf, sizeof(sz_image_buf));
		assign_mat(td.mt_image, sz_image_buf);
		td.mt_image = td.mt_image / 256.;
		mry_train_labels >> uc_label;
		td.i_num = uc_label;
		td.mt_label.get((int)uc_label, 0) = 1;
		vec_train_data.push_back(td);
	}

	std::string str_train_times;
	std::cout << "train times:";
	std::getline(std::cin, str_train_times);

	int i_train_times = std::stol(str_train_times);

	std::cout << "train data set size:";
	std::string str_train_data_set_size;
	std::getline(std::cin, str_train_data_set_size);
	int i_train_data_set_size = std::stol(str_train_data_set_size);

	std::cout << "repeat threshold:";
	std::string str_repeat_threshold;
	std::getline(std::cin, str_repeat_threshold);
	double dthreshold = std::stod(str_repeat_threshold);

	path pth(g_cstr_file_name);
	if (exists(pth))
	{
		printf("read cnn data from file '%s'\r\n", g_cstr_file_name);
		read_cnn(g_cstr_file_name);
	}
	else
	{
		printf("init cnn with default data\r\n");
	}
	double dd = 0.0;

	std::random_device rd;
	std::mt19937 rng(rd());

	std::shuffle(vec_train_data.begin(), vec_train_data.end(), rng);
	std::vector<train_data> vec_train_set(vec_train_data.begin(), vec_train_data.begin() + i_train_data_set_size);
	progress_display pd(i_train_times * i_train_data_set_size);
	boost::timer tmr;
	std::vector<mat<28, 28, double> > vec_datas;
	std::vector<mat<10, 1, double> > vec_labels;
	for (int i = 0; i < i_train_times; ++i)
	{
		int i_right = 0, i_error = 0;
		std::random_device rd;
		std::mt19937 rng(rd());
		std::shuffle(vec_train_set.begin(), vec_train_set.end(), rng);
		int i_num = 0;
		for (int j = 0; j < i_train_data_set_size; ++j)
		{
			auto delta = train_cnn(vec_train_set[j].mt_image, vec_train_set[j].mt_label);
			auto out = delta + vec_train_set[j].mt_label;

			if (delta.max_abs() > dthreshold && i_num < 2)
			{
				i_num++;
				j--;			// * 誤差太大，重新訓練
			}
			else
			{
				if (i_num != 0)
				{
					update_inert();
				}
				else
				{
					double d_max = 0.0;
					int tmp_r = 0, tmp_c = 0;
					max_idx(tmp_r, tmp_c, d_max, out);
					if (tmp_r == vec_train_set[j].i_num)
					{
						++i_right;
					}
					else
					{
						++i_error;
					}
				}
				i_num = 0;
				++pd;
			}
		}
		write_cnn(g_cstr_file_name);
		dd = static_cast<double>(i_right) / static_cast<double>(i_right + i_error);
	}

	auto t = tmr.elapsed();
	printf("\r\ntrain spend:");
	print_time(t);

	printf("match rate:%lf\r\n", dd);
	write_cnn(g_cstr_file_name);
	printf("\r\nwrite cnn data to '%s' finish\r\n", g_cstr_file_name);

	std::vector<train_data> vec_test_data;

	std::cout << "test set(1:train data;2:test_set):" << std::endl;
	std::string str_test_set;
	std::getline(std::cin, str_test_set);
	trim(str_test_set);
	if (str_test_set == "2")
	{
		ht_memory mry_test_images(ht_memory::big_endian);
		mry_test_images.read_file("./data/t10k-images.idx3-ubyte");
		int32_t i_test_magic_num = 0, i_test_num = 0, i_test_col_num = 0, i_test_row_num = 0;
		mry_test_images >> i_test_magic_num >> i_test_num >> i_test_row_num >> i_test_col_num;
		printf("test data:\r\n\tmagic_num:%d, data_num:%d, data_row:%d, data_col:%d\r\n", i_test_magic_num, i_test_num, i_test_row_num, i_test_col_num);

		ht_memory mry_test_labels(ht_memory::big_endian);
		mry_test_labels.read_file("./data/t10k-labels.idx1-ubyte");
		int32_t i_test_magic_label_num = 0, i_test_label_num = 0;
		mry_test_labels >> i_test_magic_label_num >> i_test_label_num;



		for (int i = 0; i < i_test_num; ++i)
		{
			train_data td;
			unsigned char uc_label = 0;
			memset(sz_image_buf, 0, sizeof(sz_image_buf));
			mry_test_images.read((char*)sz_image_buf, sizeof(sz_image_buf));
			assign_mat(td.mt_image, sz_image_buf);
			td.mt_image = td.mt_image / 256.;
			mry_test_labels >> uc_label;
			td.mt_label.get((int)uc_label, 0) = 1;
			vec_test_data.push_back(td);
		}
	}
	else
	{
		vec_test_data = vec_train_set;
	}

	std::shuffle(vec_test_data.begin(), vec_test_data.end(), rng);

	std::cout << "test num:";
	std::string str_test_num;
	std::getline(std::cin, str_test_num);

	int i_test_sample_num = std::stol(str_test_num);

	int max_r = 0, max_c = 0;
	double max_v = 0.;
	for (int i = 0; i < i_test_sample_num; ++i)
	{
		auto ret = predict(vec_test_data[i].mt_image);
		max_idx(max_r, max_c, max_v, ret);
		printf("output:%d ", max_r);
		double dtmp;
		max_idx(max_r, max_c, dtmp, vec_test_data[i].mt_label);
		printf(" label:%d", max_r);
		printf(" possi:%lf \r\n", max_v);
		if (max_v > (1. - dthreshold))
		{
			Mat mt_show = convert2cv(vec_test_data[i].mt_image*256.);
			imshow("image 1", mt_show);
			char c = waitKey();
		}
	}
	_getch();
	return 0;
}

元编程程序的编译非常慢。我用200个训练样本训练了100轮，然后从测试集中取100个样本进行测试，结果大部分是正确的。