-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspongent.asm
191 lines (167 loc) · 5.45 KB
/
spongent.asm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
/*
* spongent.asm
*
* Created: 20-2-2014 16:01:24
* Author: Wouter de Groot and Erik Schneider
*/
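/*
Pointer register allocation.
Columns: data accessed, memory it lives in, routine that accesses it, pointer register used.
state_in RAM permute Y
state_out RAM permute X
input FLASH absorb Z
output RAM squeeze X
sbox FLASH permute Z
*/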
; ---------------------------------------------------------------------------
; Register allocation. r16 and up are needed wherever an immediate operand is
; used (ldi/cpi/subi/cbr/sbr); the low registers only see register-register ops.
; ---------------------------------------------------------------------------
.def ZERO = r1 ; Constant-zero source: stored to clear state bytes and copied to clear TEMP2
.def WHICHBUF = r2 ; Low address byte of a state buffer; XORed with CHANGER at the start of every round
.def TEMP2 = r3 ; Scratch (absorb: state byte being XORed; pLayer: single positioned bit)
.def LFSR = r16 ; Round-constant LFSR, stepped once per round
.def RLFSR = r17 ; Bit-reversed copy of LFSR, rebuilt every round
.def CBYTE = r18 ; State byte currently being processed
.def BITCTR = r19 ; pLayer bit counter for CBYTE, counts down from 7
.def PJAY = r20 ; Destination bit index of the bit being placed by the pLayer
.def OFFSET = r21 ; PJAY's low 3 bits: bit offset inside the destination byte
.def INPUTCTR = r22 ; Input bytes still to absorb
.def OUTPUTCTR = r23 ; Output bytes still to squeeze
.def CHANGER = r24 ; XOR mask ($e0) that flips WHICHBUF between state1 ($60) and state2 ($80)
.def TEMP1 = r25 ; Scratch (flash reads, read-modify-write of destination bytes)
.equ HASH_SIZE = $10 ; For spongent128/128/8 the output is $10 bytes
.equ STATE_SIZE = $88 ; Internal state in bits: (hash bytes + 1 rate byte) * 8 = 136
.equ INPUT_SIZE = $9 ; For this assignment the input is hardcoded
.equ LFSR_INIT = $7d ; Precursor of $7a; we run the step first so first iteration uses $7a.
; Data segment: two state buffers of STATE_SIZE/8 = 17 bytes each plus the hash output.
; The buffer addresses are fixed so that a single XOR with CHANGER switches between them.
.dseg
.org $60
state1: .byte STATE_SIZE/8
.org $80
state2: .byte STATE_SIZE/8
.org $a0
output: .byte HASH_SIZE
.cseg
; 4-bit S-box, 16 entries packed two per byte: the even-indexed entry sits in the
; high nibble, the odd-indexed entry in the low nibble (see sBoxLowNibbleAndSwap).
; The lookup overwrites ZL with the entry index, so the table's flash byte address
; must have a zero low byte; word address $0200 (byte address $0400) satisfies that.
.org $0200
sbox:
.db $ed, $b0, $21, $4f, $7a, $85, $9c, $36
; Hardcoded message. Only the first INPUT_SIZE (9) bytes are absorbed, one per
; permutation: the 8 ASCII characters followed by the $80 padding byte. The final
; $00 is never read; it only keeps the .db byte count even.
.org $0300
input:
.db $53, $70, $6f, $6e, $67, $65, $6e, $74, $80, $00 ; "Spongent" + padding + alignment
.org $0
; EXECUTION STARTS HERE
; ---------------------------------------------------------------------------
; init / absorb / squeeze / done: top-level driver of the hash.
;   init    - puts the registers and both state buffers into a known state
;   absorb  - XORs one input byte into the rate byte of the state, then permutes;
;             repeated INPUT_SIZE times
;   squeeze - copies the rate byte of the state to the output buffer, then
;             permutes; repeated HASH_SIZE times
;   done    - parks the CPU; the hash is in RAM at `output`
; Contract with permute: YL and WHICHBUF hold the low address byte of the current
; state buffer, CHANGER holds the buffer-toggle mask, ZERO holds 0, and the buffer
; that is not current is all zero.
; NOTE(review): the stack pointer is not set up here; rcall/push/pop rely on the
; device's reset value of SP. Confirm this for the target part.
; ---------------------------------------------------------------------------
init:
    ; Nothing below may rely on reset values: ZERO is used as a constant 0, the
    ; X and Y high bytes are never touched again, and the sponge must start from
    ; an all-zero state with an all-zero second buffer for the pLayer to OR into.
    clr ZERO
    ldi XH, high(state1) ; state1, state2 and output all live in the same 256-byte page
    ldi YH, high(state1)
    ldi YL, low(state1)
    ldi TEMP1, state2+STATE_SIZE/8-state1 ; Bytes from the start of state1 to the end of state2
init_clear_state:
    st Y+, ZERO ; Clears both buffers (and the unused gap between them)
    dec TEMP1
    brne init_clear_state
    ldi CHANGER, $e0 ; This switches between $60 and $80 (cannot EOR with immediate)
    ldi YL, low(state1) ; Contract: YL always points to the fresh read/writable state
    mov WHICHBUF, YL
    ldi OUTPUTCTR, HASH_SIZE ; Keep a counter of hash output bytes processed
    ldi INPUTCTR, INPUT_SIZE ; Keep a counter of input bytes processed
absorb:
    ldi ZH, high(2*input) ; permute repoints ZH at the sbox, so reload it every pass
    ldi ZL, low(2*input+INPUT_SIZE) ; Point to the end of input
    sub ZL, INPUTCTR ; Subtract counter. We now go through bytes incrementally
    lpm TEMP1, Z
    ldd TEMP2, Y+STATE_SIZE/8-1 ; The rate byte is the last byte of the state
    eor TEMP2, TEMP1
    std Y+STATE_SIZE/8-1, TEMP2
    rcall permute
    dec INPUTCTR
    brne absorb
squeeze:
    ldi XL, output+HASH_SIZE ; Point to the end of output
    sub XL, OUTPUTCTR ; Subtract reverse counter. We now go through bytes incrementally
    ldd TEMP1, Y+STATE_SIZE/8-1 ; Same rate byte that absorb writes to
    st X, TEMP1
    rcall permute ; This runs once too many, but it makes the jumping logic smaller.
    dec OUTPUTCTR
    brne squeeze
done:
    rjmp done ; Park the uC here. Hash output starts at RAM $a0
/*
* permute: one full permutation, 70 rounds over all STATE_SIZE state bits.
*
* Each round:
*   1. steps the round-constant LFSR and builds its bit-reversed copy,
*   2. XORs LFSR into the last state byte and the reversed copy into the first,
*   3. for every state byte: reads it, clears it, substitutes both nibbles through
*      the sbox, and scatters its 8 bits (MSB first) into the other buffer.
* The pLayer forces us to write to new memory, or we'd overwrite bits we hadn't read
* yet, so each round reads from one buffer and writes to the other. Source bytes are
* cleared as they are read, leaving that buffer ready to be ORed into next round.
*
* In:       YL = WHICHBUF = low address byte of the current state buffer,
*           CHANGER = buffer toggle mask, ZERO = 0, other buffer all zero,
*           XH/YH = high address byte of the buffers.
* Out:      YL = WHICHBUF = low address byte of the buffer holding the new state;
*           the other buffer is all zero.
* Clobbers: ZH, ZL (via sBoxByte), XL, LFSR, RLFSR, CBYTE, BITCTR, PJAY, OFFSET,
*           TEMP1, TEMP2, flags. Uses the stack (push/pop, nested rcall).
*/
.equ LFSR_LAST = $3f ; LFSR value used by round 70: the low six bits set (0b0111111)
permute:
    ldi ZH, high(2*sbox) ; sBoxByte loads ZL itself for every nibble, only ZH is set up here
    ldi LFSR, LFSR_INIT
permute_round:
lfsr_step:
    lsl LFSR ;MSB is always zero
    rol LFSR ;Now we can examine whether N ^ C is set (i.e. whether V is)
    brvc lfsr_step_zero
    sbr LFSR, 2 ;still rotated, so bit 0 is in place 1.
lfsr_step_zero:
    lsr LFSR ;Step complete, we undo the extra shift left (and don't care about MSB)
    ldi RLFSR, $80 ;signal bit, when ror puts it into C we know we're done.
    push LFSR ;we rotate r16 into r17 so we want to be able to get the value back
lfsr_step_rotate:
    rol LFSR ; Top bit of LFSR into C ...
    ror RLFSR ; ... and from C into the top of RLFSR, so RLFSR ends up bit-reversed
    brcc lfsr_step_rotate
    pop LFSR
    eor WHICHBUF, CHANGER ; each round goes through entire state, so we must change buffers
    ldi PJAY, $0 ; and so also reset bit iterator
    ldd CBYTE, Y+STATE_SIZE/8-1 ; First we add LFSR values. We use the 'old' value and update after
    eor CBYTE, LFSR
    std Y+STATE_SIZE/8-1, CBYTE
    ld CBYTE, Y
    eor CBYTE, RLFSR
    st Y, CBYTE
process_byte: ; Go through both sbox and pLayer for every byte
    ld CBYTE, Y
    st Y+, ZERO ; Guarantee this byte is ready for reading in next round
    rcall sBoxByte ; Use Kostas' subroutine to substitute CBYTE
    ldi BITCTR, $7 ; pLayer needs to run on all 8 bits of CBYTE
pLayer:
    ; first find target position
    mov OFFSET, PJAY
    cbr OFFSET, $f8 ; lowest 3 bits are the offset within target byte
    mov XL, PJAY
    lsr XL ; top 5 bits represent the target byte
    lsr XL
    lsr XL
    add XL, WHICHBUF ; WHICHBUF was toggled above, so X addresses the buffer being written
    mov TEMP2, ZERO
    lsl CBYTE ; put target bit in C
pLayer_position_bit:
    ror TEMP2 ; Get target bit into tmp from C
    dec OFFSET ; dec leaves C alone, so later passes shift a 0 in behind the bit
    brge pLayer_position_bit ; OFFSET+1 rotations: the bit lands OFFSET places below the MSB
    ld TEMP1, X ; on ATMega, these 3 could be LAS Z, TEMP2
    or TEMP1, TEMP2
    st X, TEMP1
    cpi PJAY, STATE_SIZE-1 ; If it's precisely state-1 then we've just processed the last bit
    breq pLayer_complete
    ; iterative modulus. Since j+=1, simply add b/4.
    ;Since PJAY can never grow larger than b+b/4 simply subtracting b is sufficient as modulus operation
    subi PJAY, -STATE_SIZE/4 ; b/4 is 136/4 is $22. No add with immediate, so sub with negative value
    cpi PJAY, STATE_SIZE ; Test b instead of b-1, this way 135 stays 135 for final bit. Actual sub is unchanged
    brlo pLayer_no_mod ; PJAY is an unsigned bit index, so compare unsigned
    subi PJAY, STATE_SIZE-1
pLayer_no_mod:
    dec BITCTR
    brge pLayer ; Not all bits have been done.
    rjmp process_byte ; All bits processed. Move on to new byte.
pLayer_complete:
    mov YL, WHICHBUF ; The buffer just written becomes the current state
    cpi LFSR, LFSR_LAST ; Round 70 used this value, time to quit.
    brne permute_round
    ret ; Permute complete
; sBoxByte: run both nibbles of CBYTE through the packed S-box table.
; Adapted from Kostas' presentation on PRESENT, with two dead instructions removed;
; the project documentation explains it in our own words.
;
; In:       CBYTE = byte to substitute
;           ZH    = high byte of the table's flash byte address (the table must
;                   start at an address whose low byte is 0, ZL is set here)
; Out:      CBYTE = substituted byte
; Clobbers: ZL, TEMP1, flags
;
; The nibble routine substitutes the low nibble and then swaps the nibbles. It is
; executed twice: once through the rcall and once more by falling into it, so the
; second pass handles the former high nibble and restores the nibble order. The
; final ret returns to sBoxByte's caller.
sBoxByte:
    rcall sBoxLowNibbleAndSwap ; first pass; returns to the label right below
sBoxLowNibbleAndSwap:
    mov ZL, CBYTE
    andi ZL, $0f ; ZL = 4-bit S-box index
    lsr ZL ; ZL = table byte holding the entry, C = 1 for an odd index
    lpm TEMP1, Z ; lpm leaves the flags alone, C still tells odd from even
    brcs sbox_entry_in_low ; odd index: entry is already the low nibble
    swap TEMP1 ; even index: bring the high nibble down
sbox_entry_in_low:
    andi TEMP1, $0f ; keep only the selected entry
    andi CBYTE, $f0 ; drop the nibble being replaced
    or CBYTE, TEMP1
    swap CBYTE ; expose the other nibble for the next pass
    ret