Upstream: yes

From 464ca6c2f37b93180cc27ea41889ffaf1eab388e Mon Sep 17 00:00:00 2001
From: Henrik Gramner
Date: Thu, 25 Jun 2020 01:27:28 +0200
Subject: [PATCH] x86: Fix 32-bit build with PIC enabled

---
 src/x86/mc_sse.asm | 147 +++++++++++++++++----------------------------
 1 file changed, 56 insertions(+), 91 deletions(-)

diff --git a/src/x86/mc_sse.asm b/src/x86/mc_sse.asm
index d98ac621..5d5c5e3f 100644
--- a/src/x86/mc_sse.asm
+++ b/src/x86/mc_sse.asm
@@ -1263,7 +1263,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
 %if ARCH_X86_64
     mova m8, [pw_8]
 %else
- %define m8 [pw_8]
+ %define m8 [t1-prep_sse2+pw_8]
 %endif
     pxor m7, m7
 %endif
@@ -1272,13 +1272,11 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     pshuflw m6, m6, q0000
 %if cpuflag(ssse3)
     punpcklqdq m6, m6
-%else
- %if ARCH_X86_64
+%elif ARCH_X86_64
     psrlw m0, m8, 3
     punpcklwd m6, m0
- %else
+%else
     punpcklwd m6, [base+pw_1]
- %endif
 %endif
 %if ARCH_X86_32
     mov t1, t2 ; save base reg for w4
@@ -1396,8 +1394,8 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     PUSH r7
 %endif
     mov r7, tmpq
+    mov r5, srcq
 %endif
-    mov t1, srcq
 .hv_w16_hloop:
     movu m0, [srcq+strideq*0+8*0]
     movu m1, [srcq+strideq*0+8*1]
@@ -1440,14 +1438,17 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     sub hd, 2
     jg .hv_w16_vloop
     movzx hd, t2w
-    add t1, 16
-    mov srcq, t1
 %if ARCH_X86_64
+    add r5, 16
     add r7, 2*16
+    mov srcq, r5
     mov tmpq, r7
 %else
+    mov srcq, srcmp
     mov tmpq, tmpmp
+    add srcq, 16
     add tmpq, 2*16
+    mov srcmp, srcq
     mov tmpmp, tmpq
 %endif
     sub t2d, 1<<16
@@ -2624,22 +2625,20 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
 %macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1
  %if cpuflag(ssse3)
     phaddw %1, %2
- %else
-  %ifnidn %1, %2
+ %elifnidn %1, %2
   %if %4 == 1
-    mova %3, [pw_1]
+    mova %3, [base+pw_1]
   %endif
     pmaddwd %1, %3
     pmaddwd %2, %3
     packssdw %1, %2
-  %else
+ %else
   %if %4 == 1
-    pmaddwd %1, [pw_1]
+    pmaddwd %1, [base+pw_1]
   %else
     pmaddwd %1, %3
   %endif
     packssdw %1, %1
-  %endif
  %endif
 %endmacro
 
@@ -2795,11 +2794,9 @@ PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
 %if ARCH_X86_32
  %define base_reg r2
  %define base base_reg-prep%+SUFFIX
- %define W32_RESTORE_SSQ mov strideq, stridem
 %else
  %define base_reg r7
  %define base 0
- %define W32_RESTORE_SSQ
 %endif
 cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
 %assign org_stack_offset stack_offset
@@ -2834,6 +2831,10 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     WIN64_SPILL_XMM 12
 %else
     WIN64_SPILL_XMM 16
+%endif
+%if ARCH_X86_32
+ %define strideq r6
+    mov strideq, stridem
 %endif
     cmp wd, 4
     je .h_w4
@@ -2894,7 +2895,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     punpcklbw m4, m4
     psraw m4, 8
 %endif
-    W32_RESTORE_SSQ
 %if ARCH_X86_64
     lea stride3q, [strideq*3]
 %endif
@@ -2916,8 +2916,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     pshufb m1, m5
     pshufb m2, m5
     pshufb m3, m5
-%else
- %if ARCH_X86_64
+%elif ARCH_X86_64
     movd m0, [srcq+strideq*0+0]
     movd m12, [srcq+strideq*0+1]
     movd m1, [srcq+strideq*1+0]
@@ -2947,7 +2946,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     punpcklqdq m1, m5 ; 1
     punpcklqdq m2, m13 ; 2
     punpcklqdq m3, m7 ; 3
- %else
+%else
     movd m0, [srcq+strideq*0+0]
     movd m1, [srcq+strideq*0+1]
     movd m2, [srcq+strideq*0+2]
@@ -2978,7 +2977,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     lea srcq, [srcq+strideq*2]
     punpckldq m7, m5
     punpcklqdq m3, m7 ; 3
- %endif
 %endif
     PMADDUBSW m0, m4, m5, m7, 1 ; subpel_filters + 2
     PMADDUBSW m1, m4, m5, m7, 0
@@ -2994,14 +2992,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     sub hd, 4
     jg .h_w4_loop
     RET
-    ;
 .h_w8:
-%if ARCH_X86_32
-    mov r3, r2
- %define base_reg r3
-    W32_RESTORE_SSQ
-%endif
-.h_w8_loop:
 %if cpuflag(ssse3)
     PREP_8TAP_H 0, srcq+strideq*0
     PREP_8TAP_H 1, srcq+strideq*1
@@ -3017,51 +3008,42 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     add tmpq, 16
     dec hd
 %endif
-    jg .h_w8_loop
+    jg .h_w8
     RET
 .h_w16:
-    mov r6, -16*1
+    mov r3, -16*1
     jmp .h_start
 .h_w32:
-    mov r6, -16*2
+    mov r3, -16*2
     jmp .h_start
 .h_w64:
-    mov r6, -16*4
+    mov r3, -16*4
     jmp .h_start
 .h_w128:
-    mov r6, -16*8
+    mov r3, -16*8
 .h_start:
-%if ARCH_X86_32
-    mov r3, r2
- %define base_reg r3
-%endif
-    sub srcq, r6
-    mov r5, r6
-    W32_RESTORE_SSQ
+    sub srcq, r3
+    mov r5, r3
 .h_loop:
 %if cpuflag(ssse3)
-    PREP_8TAP_H 0, srcq+r6+8*0
-    PREP_8TAP_H 1, srcq+r6+8*1
+    PREP_8TAP_H 0, srcq+r3+8*0
+    PREP_8TAP_H 1, srcq+r3+8*1
     mova [tmpq+16*0], m0
     mova [tmpq+16*1], m1
     add tmpq, 32
-    add r6, 16
+    add r3, 16
 %else
-    PREP_8TAP_H 0, srcq+r6
+    PREP_8TAP_H 0, srcq+r3
     mova [tmpq], m0
     add tmpq, 16
-    add r6, 8
+    add r3, 8
 %endif
     jl .h_loop
     add srcq, strideq
-    mov r6, r5
+    mov r3, r5
     dec hd
     jg .h_loop
     RET
-%if ARCH_X86_32
- %define base_reg r2
-%endif
-    ;
 .v:
     LEA base_reg, prep%+SUFFIX
 %if ARCH_X86_32
@@ -3086,7 +3068,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
  %define subpel1 [rsp+mmsize*1]
  %define subpel2 [rsp+mmsize*2]
  %define subpel3 [rsp+mmsize*3]
-%assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed
+%assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed
 %if cpuflag(ssse3)
     ALLOC_STACK -mmsize*4
 %else
@@ -3105,15 +3087,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     movd m0, [myq+6]
     PSHUFB_0X1X m0, m2
     mova subpel3, m0
- %if notcpuflag(ssse3)
-    mov r6, base_reg
- %define base_reg r6
- %endif
-    mov strideq, [rstk+stack_offset+gprsize*3]
-    lea strideq, [strideq*3]
-    sub [rstk+stack_offset+gprsize*2], strideq
     mov strideq, [rstk+stack_offset+gprsize*3]
-    mov srcq, [rstk+stack_offset+gprsize*2]
+    lea r5, [strideq*3]
+    sub srcq, r5
 %else
  %define subpel0 m8
  %define subpel1 m9
@@ -3245,10 +3221,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     jg .v_w4_loop0
 %endif
     RET
-%if ARCH_X86_32 && notcpuflag(ssse3)
- %define base_reg r2
-%endif
-    ;
 %if ARCH_X86_64
 .v_w8:
     lea r5d, [wq - 8] ; horizontal loop
@@ -3373,16 +3345,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     cmp hd, 6
     cmovs myd, mxd
     movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
-    mov r5, r2; use as new base
- %define base_reg r5
- %assign regs_used 2
+    mov strideq, stridem
+ %assign regs_used 6
     ALLOC_STACK -mmsize*14
 %assign regs_used 7
-    mov strideq, [rstk+stack_offset+gprsize*3]
-    lea strideq, [strideq*3 + 1]
-    sub [rstk+stack_offset+gprsize*2], strideq
-    mov strideq, [rstk+stack_offset+gprsize*3]
-    mov srcq, [rstk+stack_offset+gprsize*2]
+    lea r5, [strideq*3+1]
+    sub srcq, r5
 %define subpelv0 [rsp+mmsize*0]
 %define subpelv1 [rsp+mmsize*1]
 %define subpelv2 [rsp+mmsize*2]
@@ -3445,9 +3413,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
 %define hv4_line_1_3 13
 %if ARCH_X86_32
  %if cpuflag(ssse3)
- %define w8192reg [base+pw_8192]
+  %define w8192reg [base+pw_8192]
  %else
- %define w8192reg [base+pw_2]
+  %define w8192reg [base+pw_2]
  %endif
  %define d32reg [base+pd_32]
 %else
@@ -3676,7 +3644,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
 %define hv8_line_6 4
     shr mxd, 16
 %if ARCH_X86_32
- %define base_reg r2
  %define subpelh0 [rsp+mmsize*5]
  %define subpelh1 [rsp+mmsize*6]
  %define subpelv0 [rsp+mmsize*7]
@@ -3692,16 +3659,16 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     cmp hd, 6
     cmovs myd, mxd
     movq m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
-    ALLOC_STACK -mmsize*13
+    mov strideq, stridem
+ %assign regs_used 6
+    ALLOC_STACK -mmsize*14
+ %assign regs_used 7
 %if STACK_ALIGNMENT < mmsize
-    mov rstk, r2m
- %define tmpm [rsp+mmsize*13+gprsize*1]
- %define srcm [rsp+mmsize*13+gprsize*2]
- %define stridem [rsp+mmsize*13+gprsize*3]
-    mov stridem, rstk
+  %define tmpm [rsp+mmsize*13+gprsize*1]
+  %define srcm [rsp+mmsize*13+gprsize*2]
+  %define stridem [rsp+mmsize*13+gprsize*3]
+    mov stridem, strideq
 %endif
-    mov r6, r2
- %define base_reg r6
     pshufd m0, m1, q0000
     pshufd m1, m1, q1111
     punpcklbw m5, m5
@@ -3724,12 +3691,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     mova subpelv1, m3
     mova subpelv2, m4
     mova subpelv3, m5
-    W32_RESTORE_SSQ
-    lea strided, [strided*3]
-    sub srcd, strided
-    sub srcd, 3
-    mov srcm, srcd
-    W32_RESTORE_SSQ
+    lea r5, [strideq*3+3]
+    sub srcq, r5
+    mov srcm, srcq
 %else
     ALLOC_STACK mmsize*5, 16
 %define subpelh0 m10
@@ -3765,7 +3729,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
 %if notcpuflag(ssse3)
     mova m7, [base+pw_2]
 %endif
-    lea  stride3q, [strideq*3]
+    lea stride3q, [strideq*3]
     sub srcq, 3
     sub srcq, stride3q
     mov r6, srcq
@@ -3939,11 +3903,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
 .hv_w8_outer:
     movzx hd, r5w
 %if ARCH_X86_32
-    add dword tmpm, 8
-    mov tmpq, tmpm
     mov srcq, srcm
+    mov tmpq, tmpm
     add srcq, 4
+    add tmpq, 8
     mov srcm, srcq
+    mov tmpm, tmpq
 %else
     add r8, 8
     mov tmpq, r8
-- 
2.26.2
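
Note (not part of the upstream commit): the recurring change in this diff is to stop loading constants such as pw_8 and pw_1 through absolute memory operands, which would require text relocations in a position-independent 32-bit build, and to address them relative to a register that holds the runtime address of a label in the same file (for example [t1-prep_sse2+pw_8] and [base+pw_1], with the base register set up by LEA base_reg, prep%+SUFFIX). Below is a minimal, hypothetical NASM sketch of that base-register pattern; the names add8_words, .pic and the local pw_8 copy are illustrative and not taken from mc_sse.asm, and dav1d itself sets up its base register through its LEA helper macro rather than the open-coded call/pop used here.

        ; Hypothetical standalone example, not dav1d code: PIC-safe access to a
        ; 16-byte constant on x86-32 through a base register.
        BITS 32

        SECTION .rodata align=16
pw_8:   times 8 dw 8                  ; eight packed words with value 8

        SECTION .text
        global  add8_words
add8_words:                           ; void add8_words(int16_t dst[8]); cdecl
        mov     eax, [esp+4]          ; dst
        call    .pic                  ; pushes the address of .pic
.pic:   pop     edx                   ; edx = runtime address of .pic
        ; non-PIC form (what the patch removes):  paddw xmm0, [pw_8]
        ; PIC form: pw_8-.pic is resolved at link time, no text relocation
        movdqu  xmm0, [eax]
        paddw   xmm0, [edx-.pic+pw_8]
        movdqu  [eax], xmm0
        ret

The subtracted label (.pic here, prep_sse2 in the patch) sits in the same section as the instruction that references it, which lets the assembler express the difference as a PC-relative relocation; that is presumably why the patch anchors every constant to the function's own entry label.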