Skip to content

Commit

Permalink
Merge remote-tracking branch 'qatar/master'
Browse files (browse the repository at this point in the history)
* qatar/master:
  x86: fix build with nasm 2.08
  x86: use nop cpu directives only if supported
  x86: fix rNmp macros with nasm
  build: add trailing / to yasm/nasm -I flags
  x86: use 32-bit source registers with movd instruction
  x86: add colons after labels

Conflicts:
	Makefile
	libavutil/x86/x86inc.asm

Merged-by: Michael Niedermayer <michaelni@gmx.at>
  • Loading branch information
michaelni committed Aug 7, 2012
2 parents 2da5a5c + edd8226 commit 2fc7c81
Show file tree
Hide file tree
Showing 21 changed files with 135 additions and 151 deletions.
3 changes: 2 additions & 1 deletion common.mak
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ CFLAGS += $(ECFLAGS)
CCFLAGS = $(CPPFLAGS) $(CFLAGS)
ASFLAGS := $(CPPFLAGS) $(ASFLAGS)
CXXFLAGS += $(CPPFLAGS) $(CFLAGS)
YASMFLAGS += $(IFLAGS) -I$(SRC_PATH)/libavutil/x86/ -Pconfig.asm
YASMFLAGS += $(IFLAGS:%=%/) -I$(SRC_PATH)/libavutil/x86/ -Pconfig.asm

HOSTCCFLAGS = $(IFLAGS) $(HOSTCFLAGS)
LDFLAGS := $(ALLFFLIBS:%=-Llib%) $(LDFLAGS)

Expand Down
2 changes: 2 additions & 0 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -1220,6 +1220,7 @@ HAVE_LIST="
closesocket
cmov
cpuid
cpunop
dcbzl
dev_bktr_ioctl_bt848_h
dev_bktr_ioctl_meteor_h
Expand Down Expand Up @@ -3229,6 +3230,7 @@ EOF
die "yasm not found, use --disable-yasm for a crippled build"
check_yasm "vextractf128 xmm0, ymm0, 0" || disable avx
check_yasm "vfmaddps ymm0, ymm1, ymm2, ymm3" || disable fma4
check_yasm "CPU amdnop" && enable cpunop
fi

case "$cpu" in
Expand Down
2 changes: 1 addition & 1 deletion libavcodec/x86/deinterlace.asm
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ cglobal deinterlace_line_mmx, 7,7,7, dst, lum_m4, lum_m3, lum_m2, lum_m1
%endif
pxor mm7, mm7
movq mm6, [pw_4]
.nextrow
.nextrow:
movd mm0, [lum_m4q]
movd mm1, [lum_m3q]
movd mm2, [lum_m2q]
Expand Down
6 changes: 3 additions & 3 deletions libavcodec/x86/dsputil_yasm.asm
Original file line number Diff line number Diff line change
Expand Up @@ -1143,7 +1143,7 @@ VECTOR_CLIP_INT32 6, 1, 0, 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop
.loop:
%if cpuflag(avx)
vmovaps xmm0, [src1q + 16]
vinsertf128 m0, m0, [src1q], 1
Expand Down Expand Up @@ -1182,7 +1182,7 @@ VECTOR_FMUL_REVERSE
cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop
.loop:
mova m0, [src0q + lenq]
mova m1, [src0q + lenq + mmsize]
mulps m0, m0, [src1q + lenq]
Expand Down Expand Up @@ -1313,7 +1313,7 @@ cglobal bswap32_buf, 3,4,5
add r0, 4
dec r2
jnz .loop2
.end
.end:
RET

; %1 = aligned/unaligned
Expand Down
4 changes: 2 additions & 2 deletions libavcodec/x86/dsputilenc_yasm.asm
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ cglobal hadamard8_diff16_%1, 5, 6, %2
call hadamard8x8_diff_%1
add r5d, eax

.done
.done:
mov eax, r5d
%ifndef m8
ADD rsp, pad
Expand Down Expand Up @@ -288,7 +288,7 @@ cglobal sse16_sse2, 5, 5, 8
pxor m0, m0 ; mm0 = 0
pxor m7, m7 ; mm7 holds the sum

.next2lines ; FIXME why are these unaligned movs? pix1[] is aligned
.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
movu m1, [r1 ] ; mm1 = pix1[0][0-15]
movu m2, [r2 ] ; mm2 = pix2[0][0-15]
movu m3, [r1+r3] ; mm3 = pix1[1][0-15]
Expand Down
2 changes: 1 addition & 1 deletion libavcodec/x86/fft_mmx.asm
Original file line number Diff line number Diff line change
Expand Up @@ -608,7 +608,7 @@ cglobal fft_calc, 2,5,8
add rcx, 3
shl r2, cl
sub r4, r2
.loop
.loop:
%if mmsize == 8
PSWAPD m0, [r4 + r2 + 4]
mova [r4 + r2 + 4], m0
Expand Down
2 changes: 1 addition & 1 deletion libavcodec/x86/fmtconvert.asm
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,7 @@ cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1
mov src1q, [srcq+gprsize]
mov srcq, [srcq ]
sub src1q, srcq
.loop
.loop:
MOVPS m0, [srcq ]
MOVPS m1, [srcq+src1q ]
MOVPS m3, [srcq +mmsize]
Expand Down
30 changes: 15 additions & 15 deletions libavcodec/x86/h264_chromamc.asm
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ SECTION .text

%macro mv0_pixels_mc8 0
lea r4, [r2*2 ]
.next4rows
.next4rows:
movq mm0, [r1 ]
movq mm1, [r1+r2]
add r1, r4
Expand Down Expand Up @@ -117,7 +117,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0
mv0_pixels_mc8
REP_RET

.at_least_one_non_zero
.at_least_one_non_zero:
%ifidn %2, rv40
%if ARCH_X86_64
mov r7, r5
Expand Down Expand Up @@ -145,7 +145,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0
test r4d, r4d
mov r6, r2 ; dxy = x ? 1 : stride
jne .both_non_zero
.my_is_zero
.my_is_zero:
; mx == 0 XOR my == 0 - 1 dimensional filter only
or r4d, r5d ; x + y

Expand All @@ -166,7 +166,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0
pxor m7, m7
psubw m4, m5 ; mm4 = A = 8-x

.next1drow
.next1drow:
movq m0, [r1 ] ; mm0 = src[0..7]
movq m2, [r1+r6] ; mm1 = src[1..8]

Expand Down Expand Up @@ -197,7 +197,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0
jne .next1drow
REP_RET

.both_non_zero ; general case, bilinear
.both_non_zero: ; general case, bilinear
movd m4, r4d ; x
movd m6, r5d ; y
%ifidn %2, rv40
Expand Down Expand Up @@ -232,7 +232,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0

movq m0, [r1 ] ; mm0 = src[0..7]
movq m1, [r1+1] ; mm1 = src[1..8]
.next2drow
.next2drow:
add r1, r2

movq m2, m0
Expand Down Expand Up @@ -330,7 +330,7 @@ cglobal %1_%2_chroma_mc4_%3, 6, 6 + extra_regs, 0
pmullw m6, m2
paddw m6, m0

.next2rows
.next2rows:
movd m0, [r1 ]
movd m1, [r1+1]
add r1, r2
Expand Down Expand Up @@ -397,7 +397,7 @@ cglobal %1_%2_chroma_mc2_%3, 6, 7, 0
punpcklbw m2, m7
pshufw m2, m2, 0x94 ; mm0 = src[0,1,1,2]

.nextrow
.nextrow:
add r1, r2
movq m1, m2
pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
Expand Down Expand Up @@ -474,7 +474,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
mv0_pixels_mc8
REP_RET

.at_least_one_non_zero
.at_least_one_non_zero:
test r5d, r5d
je .my_is_zero
test r4d, r4d
Expand All @@ -501,7 +501,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
movlhps m7, m7
movlhps m6, m6

.next2rows
.next2rows:
movq m1, [r1+r2*1 ]
movq m2, [r1+r2*1+1]
movq m3, [r1+r2*2 ]
Expand Down Expand Up @@ -535,7 +535,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
jg .next2rows
REP_RET

.my_is_zero
.my_is_zero:
mov r5d, r4d
shl r4d, 8
add r4, 8
Expand All @@ -545,7 +545,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
pshuflw m7, m7, 0
movlhps m7, m7

.next2xrows
.next2xrows:
movq m0, [r1 ]
movq m1, [r1 +1]
movq m2, [r1+r2 ]
Expand All @@ -572,7 +572,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
jg .next2xrows
REP_RET

.mx_is_zero
.mx_is_zero:
mov r4d, r5d
shl r5d, 8
add r5, 8
Expand All @@ -582,7 +582,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
pshuflw m7, m7, 0
movlhps m7, m7

.next2yrows
.next2yrows:
movq m0, [r1 ]
movq m1, [r1+r2 ]
movdqa m2, m1
Expand Down Expand Up @@ -632,7 +632,7 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0
punpcklbw m0, [r1+1]
pshufw m6, m6, 0

.next2rows
.next2rows:
movd m1, [r1+r2*1 ]
movd m3, [r1+r2*2 ]
punpcklbw m1, [r1+r2*1+1]
Expand Down
16 changes: 8 additions & 8 deletions libavcodec/x86/h264_chromamc_10bit.asm
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ SECTION .text
%macro MV0_PIXELS_MC8 0
lea r4, [r2*3 ]
lea r5, [r2*4 ]
.next4rows
.next4rows:
movu m0, [r1 ]
movu m1, [r1+r2 ]
CHROMAMC_AVG m0, [r0 ]
Expand Down Expand Up @@ -72,14 +72,14 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8
MV0_PIXELS_MC8
REP_RET

.at_least_one_non_zero
.at_least_one_non_zero:
mov r6d, 2
test r5d, r5d
je .x_interpolation
mov r6, r2 ; dxy = x ? 1 : stride
test r4d, r4d
jne .xy_interpolation
.x_interpolation
.x_interpolation:
; mx == 0 XOR my == 0 - 1 dimensional filter only
or r4d, r5d ; x + y
movd m5, r4d
Expand All @@ -88,7 +88,7 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8
SPLATW m5, m5 ; mm5 = B = x
psubw m4, m5 ; mm4 = A = 8-x

.next1drow
.next1drow:
movu m0, [r1 ] ; mm0 = src[0..7]
movu m2, [r1+r6] ; mm2 = src[1..8]

Expand All @@ -107,7 +107,7 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8
jne .next1drow
REP_RET

.xy_interpolation ; general case, bilinear
.xy_interpolation: ; general case, bilinear
movd m4, r4m ; x
movd m6, r5m ; y

Expand All @@ -125,7 +125,7 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8

movu m0, [r1 ] ; mm0 = src[0..7]
movu m1, [r1+2] ; mm1 = src[1..8]
.next2drow
.next2drow:
add r1, r2

pmullw m2, m0, m4
Expand Down Expand Up @@ -192,7 +192,7 @@ cglobal %1_h264_chroma_mc4_10, 6,6,7
pmullw m6, m2
paddw m6, m0

.next2rows
.next2rows:
MC4_OP m0, m6
MC4_OP m6, m0
sub r3d, 2
Expand Down Expand Up @@ -221,7 +221,7 @@ cglobal %1_h264_chroma_mc2_10, 6,7
pxor m7, m7
pshufw m2, [r1], 0x94 ; mm0 = src[0,1,1,2]

.nextrow
.nextrow:
add r1, r2
movq m1, m2
pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
Expand Down
4 changes: 2 additions & 2 deletions libavcodec/x86/h264_deblock_10bit.asm
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,7 @@ cglobal deblock_v_luma_intra_10, 4,7,16
shl r2d, 2
shl r3d, 2
LOAD_AB aa, bb, r2d, r3d
.loop
.loop:
mova p2, [r4+r1]
mova p1, [r4+2*r1]
mova p0, [r4+r5]
Expand Down Expand Up @@ -676,7 +676,7 @@ cglobal deblock_h_luma_intra_10, 4,7,16
mova m0, [pw_2]
shl r2d, 2
shl r3d, 2
.loop
.loop:
movu q3, [r0-8]
movu q2, [r0+r1-8]
movu q1, [r0+r1*2-8]
Expand Down
Loading

0 comments on commit 2fc7c81

Please sign in to comment.