Skip to content

Commit

Permalink
VC1: merge idct8x8, coeff adjustments and put_pixels.
Browse files Browse the repository at this point in the history
Merging these functions allows merging some loops, which makes the
results (particularly after SIMD optimizations) much faster.
(cherry picked from commit f8bed30)
  • Loading branch information
rbultje authored and michaelni committed Feb 22, 2011
1 parent 90ed277 commit 6a786b1
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 40 deletions.
64 changes: 61 additions & 3 deletions libavcodec/ppc/vc1dsp_altivec.c
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ do { \

/** Do inverse transform on 8x8 block
*/
static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
static void vc1_inv_trans_8x8_altivec(DCTELEM block[64],
int sign, int rangered)
{
vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
Expand All @@ -144,7 +145,9 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
const vector unsigned int vec_2 = vec_splat_u32(2);
const vector signed int vec_1s = vec_splat_s32(1);
const vector unsigned int vec_1 = vec_splat_u32(1);

const vector unsigned short rangered_shift = vec_splat_u16(1);
const vector signed short signed_bias = vec_sl(vec_splat_u16(4),
vec_splat_u16(4));

src0 = vec_ld( 0, block);
src1 = vec_ld( 16, block);
Expand Down Expand Up @@ -214,6 +217,27 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
src6 = vec_pack(sE, s6);
src7 = vec_pack(sF, s7);

/* Optional range-reduction post-processing of the eight packed rows
 * before they are stored back into block.
 *
 * vec_sub() and vec_sl() return their result and leave their operands
 * untouched, so every result has to be assigned back to the row vector;
 * otherwise the adjustment is silently dropped. */
if (rangered) {
    if (!sign) {
        /* signed_bias holds 4 << 4 = 64 in every lane: remove that bias
         * from each coefficient before scaling. */
        src0 = vec_sub(src0, signed_bias);
        src1 = vec_sub(src1, signed_bias);
        src2 = vec_sub(src2, signed_bias);
        src3 = vec_sub(src3, signed_bias);
        src4 = vec_sub(src4, signed_bias);
        src5 = vec_sub(src5, signed_bias);
        src6 = vec_sub(src6, signed_bias);
        src7 = vec_sub(src7, signed_bias);
    }
    /* rangered_shift holds 1 in every lane: double each coefficient. */
    src0 = vec_sl(src0, rangered_shift);
    src1 = vec_sl(src1, rangered_shift);
    src2 = vec_sl(src2, rangered_shift);
    src3 = vec_sl(src3, rangered_shift);
    src4 = vec_sl(src4, rangered_shift);
    src5 = vec_sl(src5, rangered_shift);
    src6 = vec_sl(src6, rangered_shift);
    src7 = vec_sl(src7, rangered_shift);
}

vec_st(src0, 0, block);
vec_st(src1, 16, block);
vec_st(src2, 32, block);
Expand All @@ -224,6 +248,36 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
vec_st(src7,112, block);
}

/**
 * Inverse-transform an 8x8 block in place, then pass it to
 * ff_add_pixels_clamped_c() together with dest and stride.
 *
 * The transform is invoked with sign = 0 and rangered = 0.
 */
static void vc1_inv_trans_8x8_add_altivec(uint8_t *dest, int stride,
                                          DCTELEM *b)
{
    const int sign     = 0;
    const int rangered = 0;

    vc1_inv_trans_8x8_altivec(b, sign, rangered);
    ff_add_pixels_clamped_c(b, dest, stride);
}

/**
 * Inverse-transform an 8x8 block in place, then pass it to
 * ff_put_signed_pixels_clamped_c() together with dest and stride.
 *
 * The transform is invoked with sign = 1 and rangered = 0.
 */
static void vc1_inv_trans_8x8_put_signed_altivec(uint8_t *dest, int stride,
                                                 DCTELEM *b)
{
    const int sign     = 1;
    const int rangered = 0;

    vc1_inv_trans_8x8_altivec(b, sign, rangered);
    ff_put_signed_pixels_clamped_c(b, dest, stride);
}

/**
 * Inverse-transform an 8x8 block in place with the range-reduction flag
 * set, then pass it to ff_put_signed_pixels_clamped_c() together with
 * dest and stride.
 *
 * The transform is invoked with sign = 1 and rangered = 1.
 */
static void vc1_inv_trans_8x8_put_signed_rangered_altivec(uint8_t *dest,
                                                          int stride,
                                                          DCTELEM *b)
{
    const int sign     = 1;
    const int rangered = 1;

    vc1_inv_trans_8x8_altivec(b, sign, rangered);
    ff_put_signed_pixels_clamped_c(b, dest, stride);
}

/**
 * Inverse-transform an 8x8 block in place, then pass it to
 * ff_put_pixels_clamped_c() together with dest and stride.
 *
 * The transform is invoked with sign = 0 and rangered = 0.
 */
static void vc1_inv_trans_8x8_put_altivec(uint8_t *dest, int stride,
                                          DCTELEM *b)
{
    const int sign     = 0;
    const int rangered = 0;

    vc1_inv_trans_8x8_altivec(b, sign, rangered);
    ff_put_pixels_clamped_c(b, dest, stride);
}

/**
 * Inverse-transform an 8x8 block in place with the range-reduction flag
 * set, then pass it to ff_put_pixels_clamped_c() together with dest and
 * stride.
 *
 * The transform is invoked with sign = 0 and rangered = 1.
 */
static void vc1_inv_trans_8x8_put_rangered_altivec(uint8_t *dest, int stride,
                                                   DCTELEM *b)
{
    const int sign     = 0;
    const int rangered = 1;

    vc1_inv_trans_8x8_altivec(b, sign, rangered);
    ff_put_pixels_clamped_c(b, dest, stride);
}

/** Do inverse transform on 8x4 part of block
*/
static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, DCTELEM *block)
Expand Down Expand Up @@ -342,7 +396,11 @@ void ff_vc1dsp_init_altivec(VC1DSPContext* dsp)
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;

dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec;
dsp->vc1_inv_trans_8x8_add = vc1_inv_trans_8x8_add_altivec;
dsp->vc1_inv_trans_8x8_put_signed[0] = vc1_inv_trans_8x8_put_signed_altivec;
dsp->vc1_inv_trans_8x8_put_signed[1] = vc1_inv_trans_8x8_put_signed_rangered_altivec;
dsp->vc1_inv_trans_8x8_put[0] = vc1_inv_trans_8x8_put_altivec;
dsp->vc1_inv_trans_8x8_put[1] = vc1_inv_trans_8x8_put_rangered_altivec;
dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec;
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_altivec;
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_altivec;
Expand Down
28 changes: 27 additions & 1 deletion libavcodec/vc1.c
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,28 @@ static int vop_dquant_decoding(VC1Context *v)

static int decode_sequence_header_adv(VC1Context *v, GetBitContext *gb);

/**
 * Run ff_simple_idct() on block in place, rescale every coefficient as
 * (c - 64) * 2, and pass the block to ff_put_pixels_clamped_c().
 *
 * @param dest      destination pointer forwarded to ff_put_pixels_clamped_c()
 * @param line_size line size forwarded to ff_put_pixels_clamped_c()
 * @param block     64 coefficients; overwritten by this function
 */
static void simple_idct_put_rangered(uint8_t *dest, int line_size, DCTELEM *block)
{
    int i;

    ff_simple_idct(block);
    /* Multiply rather than shift: (block[i] - 64) is negative for any
     * coefficient below 64, and left-shifting a negative value is
     * undefined behaviour in C. The numeric result is the same. */
    for (i = 0; i < 64; i++)
        block[i] = (block[i] - 64) * 2;
    ff_put_pixels_clamped_c(block, dest, line_size);
}

/**
 * Run ff_simple_idct() on block in place and pass the result, unscaled,
 * to ff_put_signed_pixels_clamped_c().
 *
 * @param dest      destination pointer forwarded to the put routine
 * @param line_size line size forwarded to the put routine
 * @param block     64 coefficients; overwritten by the transform
 */
static void simple_idct_put_signed(uint8_t *dest, int line_size,
                                   DCTELEM *block)
{
    /* transform first, store second */
    ff_simple_idct(block);

    ff_put_signed_pixels_clamped_c(block, dest, line_size);
}

/**
 * Run ff_simple_idct() on block in place, double every coefficient, and
 * pass the block to ff_put_signed_pixels_clamped_c().
 *
 * @param dest      destination pointer forwarded to the put routine
 * @param line_size line size forwarded to the put routine
 * @param block     64 coefficients; overwritten by this function
 */
static void simple_idct_put_signed_rangered(uint8_t *dest, int line_size, DCTELEM *block)
{
    int i;

    ff_simple_idct(block);
    /* Multiply rather than shift: the coefficients are signed and may be
     * negative, and left-shifting a negative value is undefined behaviour
     * in C. The numeric result is the same. */
    for (i = 0; i < 64; i++)
        block[i] = block[i] * 2;
    ff_put_signed_pixels_clamped_c(block, dest, line_size);
}

/**
* Decode Simple/Main Profiles sequence header
* @see Figure 7-8, p16-17
Expand Down Expand Up @@ -337,7 +359,11 @@ int vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitConte
v->res_fasttx = get_bits1(gb);
if (!v->res_fasttx)
{
v->vc1dsp.vc1_inv_trans_8x8 = ff_simple_idct;
v->vc1dsp.vc1_inv_trans_8x8_add = ff_simple_idct_add;
v->vc1dsp.vc1_inv_trans_8x8_put[0] = ff_simple_idct_put;
v->vc1dsp.vc1_inv_trans_8x8_put[1] = simple_idct_put_rangered;
v->vc1dsp.vc1_inv_trans_8x8_put_signed[0] = simple_idct_put_signed;
v->vc1dsp.vc1_inv_trans_8x8_put_signed[1] = simple_idct_put_signed_rangered;
v->vc1dsp.vc1_inv_trans_8x4 = ff_simple_idct84_add;
v->vc1dsp.vc1_inv_trans_4x8 = ff_simple_idct48_add;
v->vc1dsp.vc1_inv_trans_4x4 = ff_simple_idct44_add;
Expand Down
58 changes: 33 additions & 25 deletions libavcodec/vc1dec.c
Original file line number Diff line number Diff line change
Expand Up @@ -2009,8 +2009,7 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan
if(i==1)
v->vc1dsp.vc1_inv_trans_8x8_dc(dst, linesize, block);
else{
v->vc1dsp.vc1_inv_trans_8x8(block);
s->dsp.add_pixels_clamped(block, dst, linesize);
v->vc1dsp.vc1_inv_trans_8x8_add(dst, linesize, block);
}
if(apply_filter && cbp_top & 0xC)
v->vc1dsp.vc1_v_loop_filter8(dst, linesize, v->pq);
Expand Down Expand Up @@ -2117,7 +2116,7 @@ static int vc1_decode_p_mb(VC1Context *v)
{
MpegEncContext *s = &v->s;
GetBitContext *gb = &s->gb;
int i, j;
int i;
int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
int cbp; /* cbp decoding stuff */
int mqdiff, mquant; /* MB quantization */
Expand Down Expand Up @@ -2149,6 +2148,8 @@ static int vc1_decode_p_mb(VC1Context *v)
{
if (!skipped)
{
vc1_idct_func idct8x8_fn;

GET_MVDATA(dmv_x, dmv_y);

if (s->mb_intra) {
Expand Down Expand Up @@ -2183,6 +2184,7 @@ static int vc1_decode_p_mb(VC1Context *v)
VC1_TTMB_VLC_BITS, 2);
if(!s->mb_intra) vc1_mc_1mv(v, 0);
dst_idx = 0;
idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
for (i=0; i<6; i++)
{
s->dc_val[0][s->block_index[i]] = 0;
Expand All @@ -2200,9 +2202,9 @@ static int vc1_decode_p_mb(VC1Context *v)

vc1_decode_intra_block(v, s->block[i], i, val, mquant, (i&4)?v->codingset2:v->codingset);
if((i>3) && (s->flags & CODEC_FLAG_GRAY)) continue;
v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
if(v->rangeredfrm) for(j = 0; j < 64; j++) s->block[i][j] <<= 1;
s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
idct8x8_fn(s->dest[dst_idx] + off,
i & 4 ? s->uvlinesize : s->linesize,
s->block[i]);
if(v->pq >= 9 && v->overlap) {
if(v->c_avail)
v->vc1dsp.vc1_h_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
Expand Down Expand Up @@ -2267,6 +2269,7 @@ static int vc1_decode_p_mb(VC1Context *v)
{
int intra_count = 0, coded_inter = 0;
int is_intra[6], is_coded[6];
vc1_idct_func idct8x8_fn;
/* Get CBPCY */
cbp = get_vlc2(&v->s.gb, v->cbpcy_vlc->table, VC1_CBPCY_P_VLC_BITS, 2);
for (i=0; i<6; i++)
Expand Down Expand Up @@ -2316,6 +2319,7 @@ static int vc1_decode_p_mb(VC1Context *v)
}
if (!v->ttmbf && coded_inter)
ttmb = get_vlc2(gb, ff_vc1_ttmb_vlc[v->tt_index].table, VC1_TTMB_VLC_BITS, 2);
idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
for (i=0; i<6; i++)
{
dst_idx += i >> 2;
Expand All @@ -2331,9 +2335,9 @@ static int vc1_decode_p_mb(VC1Context *v)

vc1_decode_intra_block(v, s->block[i], i, is_coded[i], mquant, (i&4)?v->codingset2:v->codingset);
if((i>3) && (s->flags & CODEC_FLAG_GRAY)) continue;
v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
if(v->rangeredfrm) for(j = 0; j < 64; j++) s->block[i][j] <<= 1;
s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, (i&4)?s->uvlinesize:s->linesize);
idct8x8_fn(s->dest[dst_idx] + off,
(i&4)?s->uvlinesize:s->linesize,
s->block[i]);
if(v->pq >= 9 && v->overlap) {
if(v->c_avail)
v->vc1dsp.vc1_h_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
Expand Down Expand Up @@ -2409,7 +2413,7 @@ static void vc1_decode_b_mb(VC1Context *v)
{
MpegEncContext *s = &v->s;
GetBitContext *gb = &s->gb;
int i, j;
int i;
int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
int cbp = 0; /* cbp decoding stuff */
int mqdiff, mquant; /* MB quantization */
Expand All @@ -2422,6 +2426,7 @@ static void vc1_decode_b_mb(VC1Context *v)
int skipped, direct;
int dmv_x[2], dmv_y[2];
int bmvtype = BMV_TYPE_BACKWARD;
vc1_idct_func idct8x8_fn;

mquant = v->pq; /* Loosy initialization */
s->mb_intra = 0;
Expand Down Expand Up @@ -2519,6 +2524,7 @@ static void vc1_decode_b_mb(VC1Context *v)
}
}
dst_idx = 0;
idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
for (i=0; i<6; i++)
{
s->dc_val[0][s->block_index[i]] = 0;
Expand All @@ -2536,9 +2542,9 @@ static void vc1_decode_b_mb(VC1Context *v)

vc1_decode_intra_block(v, s->block[i], i, val, mquant, (i&4)?v->codingset2:v->codingset);
if((i>3) && (s->flags & CODEC_FLAG_GRAY)) continue;
v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
if(v->rangeredfrm) for(j = 0; j < 64; j++) s->block[i][j] <<= 1;
s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
idct8x8_fn(s->dest[dst_idx] + off,
i & 4 ? s->uvlinesize : s->linesize,
s->block[i]);
} else if(val) {
vc1_decode_p_block(v, s->block[i], i, mquant, ttmb, first_block, s->dest[dst_idx] + off, (i&4)?s->uvlinesize:s->linesize, (i&4) && (s->flags & CODEC_FLAG_GRAY), 0, 0, 0);
if(!v->ttmbf && ttmb < 8) ttmb = -1;
Expand All @@ -2551,11 +2557,12 @@ static void vc1_decode_b_mb(VC1Context *v)
*/
static void vc1_decode_i_blocks(VC1Context *v)
{
int k, j;
int k;
MpegEncContext *s = &v->s;
int cbp, val;
uint8_t *coded_val;
int mb_pos;
vc1_idct_func idct8x8_fn;

/* select codingmode used for VLC tables selection */
switch(v->y_ac_table_index){
Expand Down Expand Up @@ -2590,6 +2597,10 @@ static void vc1_decode_i_blocks(VC1Context *v)
s->mb_x = s->mb_y = 0;
s->mb_intra = 1;
s->first_slice_line = 1;
if(v->pq >= 9 && v->overlap) {
idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
} else
idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put[!!v->rangeredfrm];
for(s->mb_y = 0; s->mb_y < s->mb_height; s->mb_y++) {
s->mb_x = 0;
ff_init_block_index(s);
Expand Down Expand Up @@ -2626,14 +2637,9 @@ static void vc1_decode_i_blocks(VC1Context *v)
vc1_decode_i_block(v, s->block[k], k, val, (k<4)? v->codingset : v->codingset2);

if (k > 3 && (s->flags & CODEC_FLAG_GRAY)) continue;
v->vc1dsp.vc1_inv_trans_8x8(s->block[k]);
if(v->pq >= 9 && v->overlap) {
if (v->rangeredfrm) for(j = 0; j < 64; j++) s->block[k][j] <<= 1;
s->dsp.put_signed_pixels_clamped(s->block[k], dst[k], k & 4 ? s->uvlinesize : s->linesize);
} else {
if (v->rangeredfrm) for(j = 0; j < 64; j++) s->block[k][j] = (s->block[k][j] - 64) << 1;
s->dsp.put_pixels_clamped(s->block[k], dst[k], k & 4 ? s->uvlinesize : s->linesize);
}
idct8x8_fn(dst[k],
k & 4 ? s->uvlinesize : s->linesize,
s->block[k]);
}

if(v->pq >= 9 && v->overlap) {
Expand Down Expand Up @@ -2691,6 +2697,7 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
int mqdiff;
int overlap;
GetBitContext *gb = &s->gb;
vc1_idct_func idct8x8_fn;

/* select codingmode used for VLC tables selection */
switch(v->y_ac_table_index){
Expand Down Expand Up @@ -2721,6 +2728,7 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
s->mb_x = s->mb_y = 0;
s->mb_intra = 1;
s->first_slice_line = 1;
idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[0];
for(s->mb_y = 0; s->mb_y < s->mb_height; s->mb_y++) {
s->mb_x = 0;
ff_init_block_index(s);
Expand Down Expand Up @@ -2777,9 +2785,9 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
vc1_decode_i_block_adv(v, s->block[k], k, val, (k<4)? v->codingset : v->codingset2, mquant);

if (k > 3 && (s->flags & CODEC_FLAG_GRAY)) continue;
v->vc1dsp.vc1_inv_trans_8x8(s->block[k]);
s->dsp.put_signed_pixels_clamped(s->block[k], dst[k],
k & 4 ? s->uvlinesize : s->linesize);
idct8x8_fn(dst[k],
k & 4 ? s->uvlinesize : s->linesize,
s->block[k]);
}

if(overlap) {
Expand Down
Loading

0 comments on commit 6a786b1

Please sign in to comment.