ACTF2025 Reverse Deeptx WP

前言

又是和aliyunctf一样的Cuda图片解密题,不过这次第一层加密上了运动模糊卷积加密,导致图片解密到这一层没法再解密了,只能利用运动模糊还原算法来让图片尽可能清晰。这题因为Cuda的Sync卡了一天,没注意到Layer3加密中的几个Sync汇编,导致加密还原完全正确,但加密执行的流程错了。

分析

程序载入ida看到是cuda代码相关,这边复制了sbox、tbox、motion三个数组数据到device,读入了flag.bmp并跳过前面的bmp格式数据,载入了0x10000大小的数据,即256*256,进行了三次Layer加密计算,最后输出到图片。

alt text

直接用cuobjdump进行dump出ptx汇编代码。

alt text

ptx汇编代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
Fatbin elf code:
================
arch = sm_86
code version = [1,7]
host = linux
compile_size = 64bit

Fatbin elf code:
================
arch = sm_86
code version = [1,7]
host = linux
compile_size = 64bit

Fatbin ptx code:
================
arch = sm_86
code version = [8,7]
host = linux
compile_size = 64bit
compressed
ptxasOptions =

//
//
//
//
//
//

.version 8.7
.target sm_86
.address_size 64

//
.const .align 1 .b8 cuda_sbox[256];
.const .align 1 .b8 cuda_tbox[256];
.const .align 4 .b8 cuda_motion[1024];

// Layer1(param_0 = input bytes, param_1 = output bytes).
// Each thread accumulates a 16x16 weighted sum of input bytes using the float
// table cuda_motion, then stores the truncated result as one output byte at
// index ctaid.x * ntid.x + tid.x. Threads with tid.x >= 241 or ctaid.x >= 241
// skip the accumulation but still store 0.
.visible .entry _Z6Layer1PhS_(
.param .u64 _Z6Layer1PhS__param_0,
.param .u64 _Z6Layer1PhS__param_1
)
{
.reg .pred %p<6>;
.reg .b16 %rs<2>;
.reg .f32 %f<12>;
.reg .b32 %r<23>;
.reg .b64 %rd<15>;


ld.param.u64 %rd5, [_Z6Layer1PhS__param_0];
ld.param.u64 %rd6, [_Z6Layer1PhS__param_1];
// %r1 = tid.x, %r2 = ctaid.x; %p3 = (tid.x < 241) && (ctaid.x < 241), unsigned compares.
mov.u32 %r1, %tid.x;
setp.lt.u32 %p1, %r1, 241;
mov.u32 %r2, %ctaid.x;
setp.lt.u32 %p2, %r2, 241;
and.pred %p3, %p1, %p2;
@%p3 bra $L__BB0_2;
// Out-of-range threads jump to the block that just zeroes the accumulator.
bra.uni $L__BB0_1;

$L__BB0_2:
// Setup: %r3 = ntid.x, %rd1 = global input pointer, %f10 = accumulator (0.0),
// %rd8 = address of cuda_motion, %r20 = outer loop counter i (starts at 0).
mov.u32 %r3, %ntid.x;
cvta.to.global.u64 %rd1, %rd5;
mov.f32 %f10, 0f00000000;
mov.u32 %r11, 0;
mov.u64 %rd8, cuda_motion;
mov.u32 %r20, %r11;

$L__BB0_3:
.pragma "nounroll";
// Outer loop over i: %r21 = (i + ctaid.x) * ntid.x + tid.x is the input index,
// %rd14 = &cuda_motion[240 - 16 * i] (4-byte elements), %r22 = inner counter j = 0.
add.s32 %r13, %r20, %r2;
shl.b32 %r14, %r20, 4;
mov.u32 %r15, 240;
sub.s32 %r16, %r15, %r14;
mad.lo.s32 %r21, %r13, %r3, %r1;
mul.wide.u32 %rd7, %r16, 4;
add.s64 %rd14, %rd8, %rd7;
mov.u32 %r22, %r11;

$L__BB0_4:
.pragma "nounroll";
// Inner loop, 16 iterations: acc = weight * (float)input[%r21] + acc (fused multiply-add),
// then advance the input index by 1 and the weight pointer by one float.
cvt.u64.u32 %rd9, %r21;
add.s64 %rd10, %rd1, %rd9;
ld.global.u8 %rs1, [%rd10];
cvt.rn.f32.u16 %f7, %rs1;
ld.const.f32 %f8, [%rd14];
fma.rn.f32 %f10, %f8, %f7, %f10;
add.s32 %r21, %r21, 1;
add.s64 %rd14, %rd14, 4;
add.s32 %r22, %r22, 1;
setp.ne.s32 %p4, %r22, 16;
@%p4 bra $L__BB0_4;

// Next i; the outer loop runs while i < 16.
add.s32 %r20, %r20, 1;
setp.lt.u32 %p5, %r20, 16;
@%p5 bra $L__BB0_3;
bra.uni $L__BB0_6;

$L__BB0_1:
// Path taken by threads that failed the < 241 test: accumulator is 0.0.
mov.f32 %f10, 0f00000000;

$L__BB0_6:
// Common tail for BOTH paths: convert the accumulator to u32 rounding toward zero
// and store its low byte at output[ctaid.x * ntid.x + tid.x].
cvt.rzi.u32.f32 %r17, %f10;
mov.u32 %r18, %ntid.x;
mad.lo.s32 %r19, %r2, %r18, %r1;
cvt.u64.u32 %rd11, %r19;
cvta.to.global.u64 %rd12, %rd6;
add.s64 %rd13, %rd12, %rd11;
st.global.u8 [%rd13], %r17;
ret;

}
//
// Layer2(param_0 = input bytes, param_1 = output bytes).
// Scatters one byte per thread: reads input[ctaid.x * ntid.x + tid.x] and writes it
// to output[ntid.x * cuda_sbox[tid.x] + cuda_sbox[ctaid.x]].
.visible .entry _Z6Layer2PhS_(
.param .u64 _Z6Layer2PhS__param_0,
.param .u64 _Z6Layer2PhS__param_1
)
{
.reg .b16 %rs<2>;
.reg .b32 %r<8>;
.reg .b64 %rd<14>;


ld.param.u64 %rd1, [_Z6Layer2PhS__param_0];
ld.param.u64 %rd2, [_Z6Layer2PhS__param_1];
// %rd3 = global output pointer, %rd4 = global input pointer.
cvta.to.global.u64 %rd3, %rd2;
cvta.to.global.u64 %rd4, %rd1;
// %r1 = ctaid.x, %r2 = ntid.x, %r3 = tid.x; %r4 = linear source index.
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %ntid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r1, %r2, %r3;
cvt.u64.u32 %rd5, %r4;
add.s64 %rd6, %rd4, %rd5;
// %rs1 = source byte.
ld.global.u8 %rs1, [%rd6];
// %r5 = cuda_sbox[tid.x]
cvt.u64.u32 %rd7, %r3;
mov.u64 %rd8, cuda_sbox;
add.s64 %rd9, %rd8, %rd7;
ld.const.u8 %r5, [%rd9];
// %r6 = cuda_sbox[ctaid.x]
cvt.u64.u32 %rd10, %r1;
add.s64 %rd11, %rd8, %rd10;
ld.const.u8 %r6, [%rd11];
// Destination index = ntid.x * sbox[tid.x] + sbox[ctaid.x]; note the thread's
// substituted value selects the row and the block's selects the column.
mad.lo.s32 %r7, %r2, %r5, %r6;
cvt.u64.u32 %rd12, %r7;
add.s64 %rd13, %rd3, %rd12;
st.global.u8 [%rd13], %rs1;
ret;

}
//
// Layer3(param_0 = input bytes, modified in place; param_1 = output bytes).
// Stages, separated by bar.sync 0 barriers:
//   1. input[idx] ^= (ctaid.x | tid.x)
//   2. threads with (tid.x & 7) == 0 run a 3238567-round add/xor/shift mixing loop
//      over the two 32-bit words at input[idx] and input[idx + 4]
//   3. input[idx] ^= (ctaid.x & tid.x)
//   4. 16-bit weighted sum over 256 input bytes starting at ctaid.x * ntid.x,
//      with weights taken from cuda_tbox
//   5. a long substitution loop (counter 8 .. 4137822) through cuda_tbox / cuda_sbox
// The final byte is stored at output[idx], idx = ctaid.x * ntid.x + tid.x.
.visible .entry _Z6Layer3PhS_(
.param .u64 _Z6Layer3PhS__param_0,
.param .u64 _Z6Layer3PhS__param_1
)
{
.reg .pred %p<5>;
.reg .b16 %rs<33>;
.reg .b32 %r<52>;
.reg .b64 %rd<24>;


ld.param.u64 %rd6, [_Z6Layer3PhS__param_0];
ld.param.u64 %rd5, [_Z6Layer3PhS__param_1];
// %r1 = ctaid.x, %r3 = tid.x, %r49 = ctaid.x * ntid.x (block base index),
// %rd1 = idx, %rd3 = &input[idx].
mov.u32 %r21, %ntid.x;
mov.u32 %r1, %ctaid.x;
mul.lo.s32 %r49, %r1, %r21;
mov.u32 %r3, %tid.x;
add.s32 %r22, %r49, %r3;
cvt.u64.u32 %rd1, %r22;
cvta.to.global.u64 %rd2, %rd6;
add.s64 %rd3, %rd2, %rd1;
// Stage 1: input[idx] ^= (ctaid.x | tid.x); only the low byte is stored.
cvt.u16.u32 %rs8, %r3;
cvt.u16.u32 %rs9, %r1;
or.b16 %rs10, %rs9, %rs8;
ld.global.u8 %rs11, [%rd3];
xor.b16 %rs12, %rs11, %rs10;
st.global.u8 [%rd3], %rs12;
// Barrier: stage 2 reads bytes that were written by other threads in stage 1.
bar.sync 0;
// Stage 2 gate: the test is on tid.x (not on the linear index); other threads skip to BB2_4.
and.b32 %r23, %r3, 7;
setp.ne.s32 %p1, %r23, 0;
@%p1 bra $L__BB2_4;

// %r48 = word at input[idx], %r47 = word at input[idx + 4],
// %r46 = running constant (starts at 1786956040), %r45 = round counter.
ld.global.u32 %r47, [%rd3+4];
ld.global.u32 %r48, [%rd3];
mov.u32 %r46, 1786956040;
mov.u32 %r45, 0;

$L__BB2_2:
.pragma "nounroll";
// %r47 += ((%r48 << 4) + 1386807340) ^ ((%r48 >> 5) + 2007053320) ^ (%r48 + %r46)
shl.b32 %r26, %r48, 4;
add.s32 %r27, %r26, 1386807340;
shr.u32 %r28, %r48, 5;
add.s32 %r29, %r28, 2007053320;
xor.b32 %r30, %r29, %r27;
add.s32 %r31, %r48, %r46;
xor.b32 %r32, %r30, %r31;
add.s32 %r47, %r32, %r47;
// %r48 -= ((%r47 << 4) + 621668851) ^ (%r46 + %r47) ^ ((%r47 >> 5) - 862448841)
shl.b32 %r33, %r47, 4;
add.s32 %r34, %r33, 621668851;
add.s32 %r35, %r46, %r47;
xor.b32 %r36, %r34, %r35;
shr.u32 %r37, %r47, 5;
add.s32 %r38, %r37, -862448841;
xor.b32 %r39, %r36, %r38;
sub.s32 %r48, %r48, %r39;
// The running constant changes by -1708609273 per round; 3238567 rounds in total.
add.s32 %r46, %r46, -1708609273;
add.s32 %r45, %r45, 1;
setp.ne.s32 %p2, %r45, 3238567;
@%p2 bra $L__BB2_2;

// Write both mixed words back in place.
st.global.u32 [%rd3], %r48;
st.global.u32 [%rd3+4], %r47;

$L__BB2_4:
// Barrier: every thread waits here until the mixing above has been written back.
bar.sync 0;
// Stage 3: input[idx] ^= (ctaid.x & tid.x).
and.b16 %rs16, %rs9, %rs8;
ld.global.u8 %rs17, [%rd3];
xor.b16 %rs18, %rs17, %rs16;
st.global.u8 [%rd3], %rs18;
// Barrier: stage 4 reads the whole 256-byte run written by the block's threads.
bar.sync 0;
// %rs31 = cuda_sbox[tid.x] (seed of the weight-index sequence), %rs32 = 16-bit sum = 0,
// %r50 = loop counter, %rd14 = address of cuda_tbox, %rd4 = global output pointer.
cvt.u64.u32 %rd7, %r3;
mov.u64 %rd8, cuda_sbox;
add.s64 %rd9, %rd8, %rd7;
ld.const.u8 %rs31, [%rd9];
cvta.to.global.u64 %rd4, %rd5;
mov.u16 %rs32, 0;
mov.u32 %r50, 0;
mov.u64 %rd14, cuda_tbox;

$L__BB2_5:
.pragma "nounroll";
// Stage 4, 256 iterations: sum += cuda_tbox[%rs31 & 255] * input[%r49];
// then %rs31 = %rs31 * 5 + 17 and %r49 advances by one byte.
cvt.u64.u32 %rd10, %r49;
add.s64 %rd11, %rd2, %rd10;
cvt.u64.u16 %rd12, %rs31;
and.b64 %rd13, %rd12, 255;
add.s64 %rd15, %rd14, %rd13;
ld.const.u8 %rs19, [%rd15];
ld.global.u8 %rs20, [%rd11];
mul.lo.s16 %rs21, %rs19, %rs20;
add.s16 %rs32, %rs21, %rs32;
mul.lo.s16 %rs22, %rs31, 5;
add.s16 %rs31, %rs22, 17;
add.s32 %r49, %r49, 1;
add.s32 %r50, %r50, 1;
setp.ne.s32 %p3, %r50, 256;
@%p3 bra $L__BB2_5;

// %r18 = ctaid.x ^ tid.x (per-thread additive constant); stage 5 counter %r51 starts at 8.
xor.b32 %r18, %r1, %r3;
mov.u32 %r51, 8;

$L__BB2_7:
.pragma "nounroll";
// Stage 5: t = (v << 3) | ((v & 224) >> 5); u = t * 13 + (ctaid.x ^ tid.x);
// v = cuda_sbox[(cuda_tbox[counter & 255] ^ u) & 255]. Only the low 8 bits of u survive the mask.
shl.b16 %rs23, %rs32, 3;
and.b16 %rs24, %rs32, 224;
shr.u16 %rs25, %rs24, 5;
or.b16 %rs26, %rs25, %rs23;
cvt.u32.u16 %r42, %rs26;
mad.lo.s32 %r43, %r42, 13, %r18;
and.b32 %r44, %r51, 255;
cvt.u64.u32 %rd16, %r44;
add.s64 %rd18, %rd14, %rd16;
cvt.u16.u32 %rs27, %r43;
ld.const.u8 %rs28, [%rd18];
xor.b16 %rs29, %rs28, %rs27;
cvt.u64.u16 %rd19, %rs29;
and.b64 %rd20, %rd19, 255;
add.s64 %rd22, %rd8, %rd20;
ld.const.u8 %rs32, [%rd22];
// Loop ends when the counter reaches 4137823.
add.s32 %r51, %r51, 1;
setp.ne.s32 %p4, %r51, 4137823;
@%p4 bra $L__BB2_7;

// Store the final byte at output[idx].
add.s64 %rd23, %rd4, %rd1;
st.global.u8 [%rd23], %rs32;
ret;

}

按ptx汇编逐行还原三个layer加密函数。

bar.sync 0;这几行汇编很重要(对应CUDA里的__syncthreads()),会让同一个线程块(block)内的所有线程都执行到这里之后才继续往下进行加密。注意它只同步block内的线程,并不同步整个grid的所有gpu线程。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>

__constant__ uint8_t cuda_sbox[256];
__constant__ uint8_t cuda_tbox[256];
__constant__ float cuda_motion[256];

/**
 * Layer1 - 16x16 weighted sum over the input bytes (reconstruction of the
 * _Z6Layer1PhS_ PTX).
 *
 * Thread (bid, tid) computes
 *     sum over i, j in [0, 16) of cuda_motion[240 - 16 * i + j] * Input[(bid + i) * bdim + tid + j]
 * and stores the truncated result at Output[bid * bdim + tid].
 *
 * Threads with tid >= 241 or bid >= 241 do NOT return early in the PTX: they
 * skip the accumulation and fall through to the common store, so they write 0.
 * An early `return` here would leave those output bytes unwritten instead.
 *
 * @param Input   source bytes, indexed as bid * blockDim.x + tid
 * @param Output  destination bytes, same indexing
 */
__global__ void Layer1(uint8_t* Input, uint8_t* Output)
{
    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int bdim = blockDim.x;

    float sum = 0.f;

    // Same predicate as the PTX: only in-range threads accumulate.
    // NOTE(review): 241 = 256 - 16 + 1, presumably chosen so the 16x16 window
    // stays inside a 256x256 image - confirm against the launch configuration.
    if (tid < 241 && bid < 241)
    {
        for (int i = 0; i < 16; i++)
        {
            for (int j = 0; j < 16; j++)
            {
                int Index = (i + bid) * bdim + (tid + j);
                // Weight rows are read bottom-up: row i uses table row (15 - i).
                sum += cuda_motion[240 - (i * 16) + j] * (float)(Input[Index]);
            }
        }
    }

    // The PTX converts float -> u32 with round-toward-zero and stores the low byte.
    Output[bid * bdim + tid] = (uint8_t)(unsigned int)(sum);
}

/**
 * Layer2 - byte scatter driven by cuda_sbox.
 *
 * The byte at Input[bid * bdim + tid] is written to
 * Output[bdim * cuda_sbox[tid] + cuda_sbox[bid]]: the thread index picks the
 * destination row and the block index picks the destination column.
 */
__global__ void Layer2(uint8_t* Input, uint8_t* Output)
{
    const int lane = threadIdx.x;
    const int block = blockIdx.x;
    const int width = blockDim.x;

    const int source = block * width + lane;
    const int target = width * cuda_sbox[lane] + cuda_sbox[block];

    Output[target] = Input[source];
}

/**
 * Layer3 - reconstruction of the _Z6Layer3PhS_ PTX.
 *
 * Input is modified in place; the final byte of each thread goes to
 * Output[bdim * bid + tid]. The three __syncthreads() calls correspond to the
 * three `bar.sync 0` instructions and must stay where they are: each later
 * stage reads bytes written by other threads of the same block.
 *
 * @param Input   working buffer (read and written)
 * @param Output  result buffer
 */
__global__ void Layer3(uint8_t* Input, uint8_t* Output)
{
    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int bdim = blockDim.x;

    int CurIndex = bdim * bid + tid;

    // Stage 1: per-byte mask.
    Input[CurIndex] ^= tid | bid;

    __syncthreads();

    // Stage 2: the PTX gates on tid.x & 7, not on the linear index. The two
    // agree only when blockDim.x is a multiple of 8, so test tid directly.
    if ((tid & 7) == 0)
    {
        uint32_t v0 = *(uint32_t*)(Input + CurIndex);
        uint32_t v1 = *(uint32_t*)(Input + CurIndex + 4);
        uint32_t sum = 1786956040;
        for (int i = 0; i < 3238567; i++)
        {
            v1 += ((v0 << 4) + 1386807340) ^ ((v0 >> 5) + 2007053320) ^ (v0 + sum);
            v0 -= ((v1 << 4) + 621668851) ^ ((v1 >> 5) - 862448841) ^ (v1 + sum);
            sum += -1708609273;
        }
        *(uint32_t*)(Input + CurIndex) = v0;
        *(uint32_t*)(Input + CurIndex + 4) = v1;
    }
    __syncthreads();

    // Stage 3: second per-byte mask.
    Input[CurIndex] ^= bid & tid;

    __syncthreads();

    // Stage 4: 16-bit weighted sum over the block's 256 bytes. The weight index
    // starts at cuda_sbox[tid] and is advanced with tmp = tmp * 5 + 17.
    uint8_t tmp = cuda_sbox[tid];
    uint16_t v = 0;

    for (int i = 0; i < 256; i++)
    {
        v += cuda_tbox[tmp & 0xff] * Input[bid * bdim + i];
        tmp = tmp * 5 + 17;
    }

    // Stage 5: long substitution chain. Only the low 8 bits of tmp2 reach the
    // table lookup, so after the first iteration v is an 8-bit value.
    for (int i = 8; i < 4137823; i++)
    {
        uint32_t tmp1 = (v << 3) | ((v & 224) >> 5);
        int tmp2 = tmp1 * 13 + (tid ^ bid);
        v = cuda_sbox[(cuda_tbox[i & 0xff] ^ tmp2) & 0xff];
    }

    Output[CurIndex] = (uint8_t)v;
}

写出Layer3和Layer2的解密函数,Layer1是运动模糊卷积算法,不可逆,只能最后用算法还原清晰点。

Layer3这边倒数第二个那个256的循环是在python用z3解,其余部分都直接用cuda代码解密即可。

底下用到的cuda_invsbox是sbox算出的逆sbox数据。

cuda解密代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
__constant__ uint8_t cuda_sbox[256];
__constant__ uint8_t cuda_tbox[256];
__constant__ uint8_t cuda_invsbox[256];

/**
 * Re_Layer2 - undoes the Layer2 scatter.
 *
 * Layer2 wrote the byte of (block b, thread t) to row cuda_sbox[t], column
 * cuda_sbox[b]. Reading position (row = bid, column = tid) therefore yields the
 * byte that belongs to block cuda_invsbox[tid], thread cuda_invsbox[bid].
 */
__global__ void Re_Layer2(uint8_t* Input, uint8_t* Output)
{
    const int column = threadIdx.x;
    const int row = blockIdx.x;
    const int width = blockDim.x;

    const int originalThread = cuda_invsbox[row];
    const int originalBlock = cuda_invsbox[column];

    Output[originalBlock * width + originalThread] = Input[row * width + column];
}

/**
 * Re_Layer3_part1 - runs Layer3's final substitution chain backwards.
 *
 * Each round undoes, in reverse order: the cuda_sbox lookup, the XOR with
 * cuda_tbox[round & 0xff], the "+ (tid ^ bid)", the "* 13", and the 3-bit
 * left rotation. The result is the low byte of Layer3's 16-bit weighted sum.
 */
__global__ void Re_Layer3_part1(uint8_t* Input, uint8_t* Output)
{
    const int lane = threadIdx.x;
    const int block = blockIdx.x;
    const int pos = blockDim.x * block + lane;
    const int mix = lane ^ block;

    // Multiplicative inverse of 13 modulo 256: 13 * 0xC5 = 2561 = 10 * 256 + 1.
    const uint8_t kInv13 = 0xC5;

    uint16_t state = Input[pos];
    int round = 4137823;
    while (round > 8)
    {
        --round; // visits 4137822 down to 8, mirroring the forward loop in reverse

        const int beforeSbox = cuda_invsbox[state & 0xff];
        const int beforeXor = beforeSbox ^ cuda_tbox[round & 0xff];
        const int rotated = ((beforeXor - mix) * kInv13) & 0xff;

        // Rotate right by 3; bits above bit 7 are discarded by the next "& 0xff".
        state = (uint16_t)((rotated >> 3) | (rotated << 5));
    }

    Output[pos] = state & 0xff;
}

/**
 * Re_Layer3_part2 - undoes the first three stages of Layer3, in reverse order:
 * the "bid & tid" mask, the 3238567-round word mixing, then the "tid | bid" mask.
 *
 * Input is modified in place and the recovered byte is copied to Output.
 * Both __syncthreads() calls are required: the mixing step reads and writes
 * 8 bytes owned by 8 different threads of the block.
 */
__global__ void Re_Layer3_part2(uint8_t* Input, uint8_t* Output)
{
    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int bdim = blockDim.x;
    int CurIndex = bdim * bid + tid;

    Input[CurIndex] ^= bid & tid;

    __syncthreads();

    // Layer3's PTX gates this step on tid.x & 7; that equals CurIndex & 7 only
    // when blockDim.x is a multiple of 8, so test tid directly.
    if ((tid & 7) == 0)
    {
        // All round arithmetic is done in uint32_t. The previous expression
        // (-1708609273) * 3238567 was evaluated in int and overflowed, which is
        // undefined behaviour; unsigned arithmetic wraps modulo 2^32 by definition.
        const uint32_t kDelta = (uint32_t)(-1708609273);
        const uint32_t kRounds = 3238567u;

        uint32_t v0 = *(uint32_t*)(Input + CurIndex);
        uint32_t v1 = *(uint32_t*)(Input + CurIndex + 4);

        // Start from the value the forward loop ended on, then step it back.
        uint32_t sum = 1786956040u + kDelta * kRounds;
        for (uint32_t i = 0; i < kRounds; i++)
        {
            sum -= kDelta;
            v0 += ((v1 << 4) + 621668851) ^ ((v1 >> 5) - 862448841) ^ (v1 + sum);
            v1 -= ((v0 << 4) + 1386807340) ^ ((v0 >> 5) + 2007053320) ^ (v0 + sum);
        }
        *(uint32_t*)(Input + CurIndex) = v0;
        *(uint32_t*)(Input + CurIndex + 4) = v1;
    }

    __syncthreads();

    Input[CurIndex] ^= tid | bid;

    Output[CurIndex] = Input[CurIndex];
}

第一层Layer3解密代码调用部分,只先解密最后那个大循环,然后256循环部分转到python进行z3求解。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
// 256-entry substitution table. main() uploads it to the device constant
// cuda_sbox and inverts it into invSbox (invSbox[sbox[i]] = i), which is only
// meaningful if the 256 values are all distinct.
// NOTE(review): the leading entries look like the SM4 S-box - not verified for the whole table.
unsigned char sbox[] =
{
0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6,
0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05, 0x2B, 0x67, 0x9A, 0x76,
0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86,
0x06, 0x99, 0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A,
0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, 0xE4, 0xB3,
0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA,
0x75, 0x8F, 0x3F, 0xA6, 0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73,
0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8,
0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB,
0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, 0x1E, 0x24, 0x0E, 0x5E,
0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21,
0x78, 0x87, 0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52,
0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E, 0xEA, 0xBF,
0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE,
0xF9, 0x61, 0x15, 0xA1, 0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34,
0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3,
0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29,
0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, 0xD5, 0xDB, 0x37, 0x45,
0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C,
0x5B, 0x51, 0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F,
0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, 0x0A, 0xC1,
0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12,
0xB8, 0xE5, 0xB4, 0xB0, 0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96,
0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84,
0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE,
0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48
};

// 256-entry byte table that main() uploads to the device constant cuda_tbox.
// The kernels use it as per-round XOR keys and as multiplication weights.
// It contains repeated values (e.g. 0x76 and 0x7C each appear twice in the
// first two rows), so unlike sbox it cannot be inverted.
unsigned char tbox[] =
{
0x62, 0x7C, 0x76, 0x7A, 0xF2, 0x6A, 0x6E, 0xC4, 0x30, 0x00,
0x66, 0x2A, 0xFE, 0xD6, 0xAA, 0x76, 0xCA, 0x82, 0xC8, 0x7C,
0xFA, 0x58, 0x46, 0xF0, 0xAC, 0xD4, 0xA2, 0xAE, 0x9C, 0xA4,
0x72, 0xC0, 0xB6, 0xFC, 0x92, 0x26, 0x36, 0x3E, 0xF6, 0xCC,
0x34, 0xA4, 0xE4, 0xF0, 0x70, 0xD8, 0x30, 0x14, 0x04, 0xC6,
0x22, 0xC2, 0x18, 0x96, 0x04, 0x9A, 0x06, 0x12, 0x80, 0xE2,
0xEA, 0x26, 0xB2, 0x74, 0x08, 0x82, 0x2C, 0x1A, 0x1A, 0x6E,
0x5A, 0xA0, 0x52, 0x3A, 0xD6, 0xB2, 0x28, 0xE2, 0x2E, 0x84,
0x52, 0xD0, 0x00, 0xEC, 0x20, 0xFC, 0xB0, 0x5A, 0x6A, 0xCA,
0xBE, 0x38, 0x4A, 0x4C, 0x58, 0xCE, 0xD0, 0xEE, 0xAA, 0xFA,
0x42, 0x4C, 0x32, 0x84, 0x44, 0xF8, 0x02, 0x7E, 0x50, 0x3C,
0x9E, 0xA8, 0x50, 0xA2, 0x40, 0x8E, 0x92, 0x9C, 0x38, 0xF4,
0xBC, 0xB6, 0xDA, 0x20, 0x10, 0xFE, 0xF2, 0xD2, 0xCC, 0x0C,
0x12, 0xEC, 0x5E, 0x96, 0x44, 0x16, 0xC4, 0xA6, 0x7E, 0x3C,
0x64, 0x5C, 0x18, 0x72, 0x60, 0x80, 0x4E, 0xDC, 0x22, 0x2A,
0x90, 0x88, 0x46, 0xEF, 0xB8, 0x14, 0xDE, 0x5E, 0x0A, 0xDA,
0xE0, 0x32, 0x3A, 0x0A, 0x48, 0x06, 0x24, 0x5C, 0xC2, 0xD2,
0xAC, 0x62, 0x90, 0x94, 0xE4, 0x78, 0xE6, 0xC8, 0x36, 0x6C,
0x8C, 0xD4, 0x4E, 0xA8, 0x6C, 0x56, 0xF4, 0xEA, 0x64, 0x7A,
0xAE, 0x08, 0xBA, 0x78, 0x24, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6,
0xE8, 0xDC, 0x74, 0x1E, 0x4A, 0xBC, 0x8A, 0x8A, 0x70, 0x3E,
0xB4, 0x66, 0x48, 0x02, 0xF6, 0x0E, 0x60, 0x34, 0x56, 0xB8,
0x86, 0xC0, 0x1C, 0x9E, 0xE0, 0xF8, 0x98, 0x10, 0x68, 0xD8,
0x8E, 0x94, 0x9A, 0x1E, 0x86, 0xE8, 0xCE, 0x54, 0x28, 0xDE,
0x8C, 0xA0, 0x88, 0x0C, 0xBE, 0xE6, 0x42, 0x68, 0x40, 0x98,
0x2C, 0x0E, 0xB0, 0x54, 0xBA, 0x16
};

// Stage 1 driver: loads the encrypted pixel data, runs Re_Layer3_part1 on the
// GPU and dumps the intermediate result to the file "Steg2" for the z3 step.
int main()
{
    // Build the inverse substitution table on the host.
    for (int value = 0; value < 256; ++value)
        invSbox[sbox[value]] = value;

    const size_t kImageBytes = 0x10000;
    // Offset passed to readFile: 14 + 40 + 1024 bytes are skipped before the pixel data.
    // NOTE(review): presumably BMP file header + info header + 256-entry palette - confirm.
    const size_t kPixelOffset = 14 + 40 + 1024;

    uint8_t* hostImage = new uint8_t[256 * 256];
    readFile("deep_flag.bmp", hostImage, kPixelOffset, kImageBytes);

    // Upload the lookup tables to constant memory.
    cudaMemcpyToSymbol(cuda_sbox, sbox, 256, 0, cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(cuda_tbox, tbox, 256, 0, cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(cuda_invsbox, invSbox, 256, 0, cudaMemcpyHostToDevice);

    uint8_t* devIn = nullptr;
    uint8_t* devOut = nullptr;
    cudaMalloc(&devIn, kImageBytes);
    cudaMalloc(&devOut, kImageBytes);

    cudaMemcpy(devIn, hostImage, kImageBytes, cudaMemcpyHostToDevice);

    // One block per image row, one thread per byte of the row.
    dim3 Blocks(256, 1, 1), Threads(256, 1, 1);
    Re_Layer3_part1<<<Blocks, Threads>>>(devIn, devOut);
    cudaDeviceSynchronize();

    cudaMemcpy(hostImage, devOut, kImageBytes, cudaMemcpyDeviceToHost);
    writeToFile("Steg2", hostImage, kImageBytes);

    cudaFree(devIn);
    cudaFree(devOut);
    delete[] hostImage;

    return 0;
}

z3求解部分代码:

运行方式为 python a.py num,其中num是分块下标(0-15)。脚本把0x10000字节的数据分成16块,每块16行×256字节,每个进程只求解自己下标对应的那一块;因此需要分别用num=0到15各启动一次(16个进程可以同时跑),最后生成Steg3_part0到Steg3_part15共16份分块的解密数据。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from z3 import*
import sys

# 256-entry substitution table, identical to the table used by the CUDA code.
# solve_input() uses sbox[tid] as the starting index of the tbox weight sequence.
# NOTE(review): the leading entries look like the SM4 S-box - not verified for the whole table.
sbox = [
0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6,
0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05, 0x2B, 0x67, 0x9A, 0x76,
0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86,
0x06, 0x99, 0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A,
0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, 0xE4, 0xB3,
0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA,
0x75, 0x8F, 0x3F, 0xA6, 0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73,
0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8,
0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB,
0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, 0x1E, 0x24, 0x0E, 0x5E,
0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21,
0x78, 0x87, 0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52,
0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E, 0xEA, 0xBF,
0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE,
0xF9, 0x61, 0x15, 0xA1, 0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34,
0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3,
0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29,
0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, 0xD5, 0xDB, 0x37, 0x45,
0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C,
0x5B, 0x51, 0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F,
0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, 0x0A, 0xC1,
0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12,
0xB8, 0xE5, 0xB4, 0xB0, 0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96,
0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84,
0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE,
0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48
]

# 256-entry weight table, identical to the table used by the CUDA code.
# solve_input() multiplies each unknown input byte by one of these values.
# It contains repeated values (e.g. 0x76 and 0x7C each appear twice in the
# first two rows), so it is not a permutation.
tbox = [
0x62, 0x7C, 0x76, 0x7A, 0xF2, 0x6A, 0x6E, 0xC4, 0x30, 0x00,
0x66, 0x2A, 0xFE, 0xD6, 0xAA, 0x76, 0xCA, 0x82, 0xC8, 0x7C,
0xFA, 0x58, 0x46, 0xF0, 0xAC, 0xD4, 0xA2, 0xAE, 0x9C, 0xA4,
0x72, 0xC0, 0xB6, 0xFC, 0x92, 0x26, 0x36, 0x3E, 0xF6, 0xCC,
0x34, 0xA4, 0xE4, 0xF0, 0x70, 0xD8, 0x30, 0x14, 0x04, 0xC6,
0x22, 0xC2, 0x18, 0x96, 0x04, 0x9A, 0x06, 0x12, 0x80, 0xE2,
0xEA, 0x26, 0xB2, 0x74, 0x08, 0x82, 0x2C, 0x1A, 0x1A, 0x6E,
0x5A, 0xA0, 0x52, 0x3A, 0xD6, 0xB2, 0x28, 0xE2, 0x2E, 0x84,
0x52, 0xD0, 0x00, 0xEC, 0x20, 0xFC, 0xB0, 0x5A, 0x6A, 0xCA,
0xBE, 0x38, 0x4A, 0x4C, 0x58, 0xCE, 0xD0, 0xEE, 0xAA, 0xFA,
0x42, 0x4C, 0x32, 0x84, 0x44, 0xF8, 0x02, 0x7E, 0x50, 0x3C,
0x9E, 0xA8, 0x50, 0xA2, 0x40, 0x8E, 0x92, 0x9C, 0x38, 0xF4,
0xBC, 0xB6, 0xDA, 0x20, 0x10, 0xFE, 0xF2, 0xD2, 0xCC, 0x0C,
0x12, 0xEC, 0x5E, 0x96, 0x44, 0x16, 0xC4, 0xA6, 0x7E, 0x3C,
0x64, 0x5C, 0x18, 0x72, 0x60, 0x80, 0x4E, 0xDC, 0x22, 0x2A,
0x90, 0x88, 0x46, 0xEF, 0xB8, 0x14, 0xDE, 0x5E, 0x0A, 0xDA,
0xE0, 0x32, 0x3A, 0x0A, 0x48, 0x06, 0x24, 0x5C, 0xC2, 0xD2,
0xAC, 0x62, 0x90, 0x94, 0xE4, 0x78, 0xE6, 0xC8, 0x36, 0x6C,
0x8C, 0xD4, 0x4E, 0xA8, 0x6C, 0x56, 0xF4, 0xEA, 0x64, 0x7A,
0xAE, 0x08, 0xBA, 0x78, 0x24, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6,
0xE8, 0xDC, 0x74, 0x1E, 0x4A, 0xBC, 0x8A, 0x8A, 0x70, 0x3E,
0xB4, 0x66, 0x48, 0x02, 0xF6, 0x0E, 0x60, 0x34, 0x56, 0xB8,
0x86, 0xC0, 0x1C, 0x9E, 0xE0, 0xF8, 0x98, 0x10, 0x68, 0xD8,
0x8E, 0x94, 0x9A, 0x1E, 0x86, 0xE8, 0xCE, 0x54, 0x28, 0xDE,
0x8C, 0xA0, 0x88, 0x0C, 0xBE, 0xE6, 0x42, 0x68, 0x40, 0x98,
0x2C, 0x0E, 0xB0, 0x54, 0xBA, 0x16
]

def solve_input(vArray, Size):
    """Recover the bytes that produced Layer3's 256-term weighted sums.

    For every row ``bid`` and thread ``tid`` one constraint is added:
    the sum over the row's 256 unknown bytes, each multiplied by
    ``tbox[index]``, must equal ``vArray[bid * 256 + tid]``. ``index``
    starts at ``sbox[tid]`` and is advanced with ``index * 5 + 17 (mod 256)``.

    Args:
        vArray: flat sequence of target values, 256 per row.
        Size: number of 256-byte rows to solve together.

    Returns:
        A list of ``Size`` rows of 256 ints, or ``None`` if the constraints
        are unsatisfiable.
    """
    solver = Solver()

    # One 8-bit unknown per input byte.
    Input = [[BitVec(f'input_{i}_{j}', 8) for j in range(256)] for i in range(Size)]

    for bid in range(Size):
        row = Input[bid]
        for tid in range(256):
            index = sbox[tid]
            acc = 0
            for unknown in row:
                # NOTE(review): the unknowns are 8-bit vectors, so presumably z3
                # evaluates this modulo 256 and the 0xffff mask has no effect - confirm.
                acc += (tbox[index & 0xff] * unknown) & 0xffff
                index = (index * 5 + 17) & 0xff

            solver.add(acc == vArray[bid * 256 + tid])

    if solver.check() != sat:
        return None

    model = solver.model()
    return [[model.evaluate(Input[i][j]).as_long() for j in range(256)]
            for i in range(Size)]

# Driver: solve one of the 16 parts of Steg2 and write it to Steg3_part<num>.
# Each part covers 16 rows of 256 bytes; rows are solved one at a time.
if len(sys.argv) < 2:
    raise SystemExit('usage: python a.py <part index 0-15>')

num = int(sys.argv[1])
if not 0 <= num < 16:
    raise SystemExit('part index must be in the range 0-15, got ' + str(num))

with open(r'Steg2' , 'rb') as f:
    data = f.read(0x10000)

# A truncated Steg2 would produce empty or short blocks further down.
if len(data) != 0x10000:
    raise SystemExit('Steg2 is too short: expected 65536 bytes, got ' + str(len(data)))

with open(r'Steg3_part'+str(num),'wb') as out_f:
    start = num * 16 * 256
    print('Running...')
    for i in range(16):
        block = data[start + i * 256 : start + i * 256 + 256 * 1]
        In_ = [b for b in block]

        result = solve_input(In_, 1)

        # Skipping an unsolved row would shorten this part file and shift every
        # later row when the 16 parts are merged, so stop instead.
        if not result:
            raise SystemExit('No solution for block ' + str(i) + ' of part ' + str(num))

        for row in result:
            out_f.write(bytes(row))

        print('Finished ' + str(i) + ' block')
    print('Finished all.')

最后Layer3前半部分解密+Layer2解密cuda代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
/**
 * Reads bufferSize bytes from filename, starting at byte offset startPos, into buffer.
 *
 * The function still returns void, so failures cannot be reported to the
 * caller; they are written to std::cerr instead of being silently ignored.
 * On an open failure the buffer is left untouched; on a short read only the
 * leading bytes that could be read are filled in.
 *
 * @param filename    path of the file to read
 * @param buffer      destination, must hold at least bufferSize bytes
 * @param startPos    byte offset at which reading starts
 * @param bufferSize  number of bytes requested
 */
void readFile(const std::string& filename, uint8_t* buffer, size_t startPos, size_t bufferSize)
{
    std::ifstream file(filename, std::ios::binary);
    if (!file.is_open())
    {
        std::cerr << "readFile: cannot open '" << filename << "'" << std::endl;
        return;
    }

    file.seekg(startPos);

    file.read(reinterpret_cast<char*>(buffer), bufferSize);

    // gcount() is the number of bytes the last read actually delivered.
    const size_t got = static_cast<size_t>(file.gcount());
    if (got != bufferSize)
    {
        std::cerr << "readFile: '" << filename << "' gave " << got
                  << " of " << bufferSize << " bytes" << std::endl;
    }
}

/**
 * Writes size bytes from data to filename, replacing any existing file.
 *
 * The function still returns void, so failures cannot be reported to the
 * caller; they are written to std::cerr instead of being silently ignored.
 *
 * @param filename  path of the file to create
 * @param data      bytes to write
 * @param size      number of bytes to write
 */
void writeToFile(const std::string& filename, const uint8_t* data, size_t size)
{
    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open())
    {
        std::cerr << "writeToFile: cannot open '" << filename << "'" << std::endl;
        return;
    }

    file.write(reinterpret_cast<const char*>(data), size);

    file.close();

    // close() flushes; a failed stream here means the data did not all reach the file.
    if (file.fail())
    {
        std::cerr << "writeToFile: writing '" << filename << "' failed" << std::endl;
    }
}

// Stage 2 driver: merges the 16 solver outputs, undoes the first half of
// Layer3 and then Layer2 on the GPU, and writes the result to "Layer2_Decrypted".
int main()
{
    // Build the inverse substitution table on the host.
    for (int value = 0; value < 256; ++value)
        invSbox[sbox[value]] = value;

    const size_t kImageBytes = 0x10000;
    const int kParts = 16;

    uint8_t* hostImage = new uint8_t[256 * 256];

    // Each part file holds 16 rows of 256 bytes; place them back to back.
    for (int part = 0; part < kParts; ++part)
    {
        uint8_t* destination = hostImage + part * 16 * 256;
        readFile("Steg3_part" + std::to_string(part), destination, 0, 0x10000 / 16);
    }

    // Upload the lookup tables to constant memory.
    cudaMemcpyToSymbol(cuda_sbox, sbox, 256, 0, cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(cuda_tbox, tbox, 256, 0, cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(cuda_invsbox, invSbox, 256, 0, cudaMemcpyHostToDevice);

    uint8_t* devA = nullptr;
    uint8_t* devB = nullptr;
    cudaMalloc(&devA, kImageBytes);
    cudaMalloc(&devB, kImageBytes);

    cudaMemcpy(devA, hostImage, kImageBytes, cudaMemcpyHostToDevice);

    dim3 Blocks(256, 1, 1), Threads(256, 1, 1);

    // devA -> devB: undo the first half of Layer3.
    Re_Layer3_part2<<<Blocks, Threads>>>(devA, devB);
    cudaDeviceSynchronize();

    // devB -> devA: undo the Layer2 scatter; the buffers swap roles.
    Re_Layer2<<<Blocks, Threads>>>(devB, devA);
    cudaDeviceSynchronize();

    cudaMemcpy(hostImage, devA, kImageBytes, cudaMemcpyDeviceToHost);
    writeToFile("Layer2_Decrypted", hostImage, kImageBytes);

    cudaFree(devA);
    cudaFree(devB);
    delete[] hostImage;

    return 0;
}

最后得到Layer1加密后的图片,是经过运动模糊卷积后的,不可完全逆向,只能用去模糊算法进行清晰化处理。

alt text

去模糊代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
import numpy as np
from scipy import signal
from PIL import Image
import matplotlib.pyplot as plt

def deblur_image(blurred_image, motion_kernel, num_iterations=30, learning_rate=0.01):
    """Sharpen a blurred grayscale image with frequency-domain Wiener-style deconvolution.

    The kernel is zero-padded into the top-left corner of an array with the
    image's shape, both are transformed with a 2-D FFT, and the estimate
    ``conj(H) * G / (|H|^2 + K)`` with ``K = 0.01`` is transformed back.

    The padded size used to be fixed at 256x256, so any other image size
    failed with a broadcasting error; it now follows the image shape.

    Args:
        blurred_image: 2-D array-like of pixel values.
        motion_kernel: blur kernel, either 2-D or a flat array whose length
            is a perfect square (e.g. 256 values for 16x16).
        num_iterations: unused; kept for backward compatibility.
        learning_rate: unused; kept for backward compatibility.

    Returns:
        ``np.uint8`` array with the same shape as ``blurred_image``.

    Raises:
        ValueError: if the image is not 2-D, the flat kernel is not square,
            or the kernel is larger than the image.
    """
    blurred = np.asarray(blurred_image)
    if blurred.ndim != 2:
        raise ValueError("blurred_image must be a 2-D array")

    kernel = np.asarray(motion_kernel, dtype=float)
    if kernel.ndim == 1:
        side = int(round(kernel.size ** 0.5))
        kernel = kernel.reshape(side, side)  # raises ValueError if not a perfect square
    elif kernel.ndim != 2:
        raise ValueError("motion_kernel must be 1-D or 2-D")

    rows, cols = kernel.shape
    if rows > blurred.shape[0] or cols > blurred.shape[1]:
        raise ValueError("motion_kernel is larger than the image")

    # Zero-pad the kernel to the image size so both spectra have the same shape.
    padded_kernel = np.zeros(blurred.shape)
    padded_kernel[:rows, :cols] = kernel

    kernel_fft = np.fft.fft2(padded_kernel)
    blurred_fft = np.fft.fft2(blurred)

    # K keeps the division stable where the kernel spectrum is close to zero.
    K = 0.01
    deblurred_fft = (np.conj(kernel_fft) * blurred_fft) / (np.abs(kernel_fft) ** 2 + K)

    deblurred = np.real(np.fft.ifft2(deblurred_fft))
    deblurred = np.clip(deblurred, 0, 255)
    return deblurred.astype(np.uint8)

def load_and_process_image(image_path, motion_data):
    """Open ``image_path`` as 8-bit grayscale and return its deblurred version.

    Args:
        image_path: path of the image file to open.
        motion_data: blur kernel values, passed on to ``deblur_image``.

    Returns:
        Whatever ``deblur_image`` returns for the loaded pixels.
    """
    grayscale = Image.open(image_path).convert('L')
    pixels = np.array(grayscale)
    return deblur_image(pixels, np.array(motion_data))

if __name__ == "__main__":
    # The 16x16 kernel has only four distinct tap values, so it is written
    # row by row here instead of as a 256-line literal.
    O = 0.0            # empty tap
    S = 0.012483786    # side tap
    P = 0.042622309    # peak tap
    E = 0.0055596731   # tap that appears only in the last two rows

    # Row 0 holds the band's first two taps in its last two columns.
    kernel_rows = [[O] * 14 + [S, P]]
    # Rows 1..13: the three-tap band S, P, S moves one column left per row.
    kernel_rows += [[O] * (14 - r) + [S, P, S] + [O] * (r - 1) for r in range(1, 14)]
    # Rows 14 and 15 do not follow the band pattern and are spelled out.
    kernel_rows += [[E, P, S] + [O] * 13,
                    [O, E] + [O] * 14]

    motion_data = np.array([tap for row in kernel_rows for tap in row])

    image_path = r"Layer2_Decrypted.bmp"

    original_img = np.array(Image.open(image_path).convert('L'))

    deblurred_img = load_and_process_image(image_path, motion_data)

    Image.fromarray(deblurred_img).save('deblurred_image.png')

最后得到一张可以看清flag的图片。

alt text

得到flag ACTF{DeEptCUdAR1VQVZ}