Skip to content

Commit

Permalink
VC1: merge idct8x8, coeff adjustments and put_pixels.
Browse files Browse the repository at this point in the history
Merging these functions allows merging some loops, which makes the
results (particularly after SIMD optimizations) much faster.
  • Loading branch information
rbultje committed Feb 21, 2011
1 parent 8d9ac96 commit f8bed30
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 40 deletions.
64 changes: 61 additions & 3 deletions libavcodec/ppc/vc1dsp_altivec.c
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ do { \

/** Do inverse transform on 8x8 block
*/
static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
static void vc1_inv_trans_8x8_altivec(DCTELEM block[64],
int sign, int rangered)
{
vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
Expand All @@ -144,7 +145,9 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
const vector unsigned int vec_2 = vec_splat_u32(2);
const vector signed int vec_1s = vec_splat_s32(1);
const vector unsigned int vec_1 = vec_splat_u32(1);

const vector unsigned short rangered_shift = vec_splat_u16(1);
const vector signed short signed_bias = vec_sl(vec_splat_u16(4),
vec_splat_u16(4));

src0 = vec_ld( 0, block);
src1 = vec_ld( 16, block);
Expand Down Expand Up @@ -214,6 +217,27 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
src6 = vec_pack(sE, s6);
src7 = vec_pack(sF, s7);

/* Optional coefficient adjustment applied after the transform when the
 * caller passes a non-zero rangered flag:
 *   - when sign is zero, first subtract signed_bias (64 in every lane),
 *   - then shift every lane left by rangered_shift (1), i.e. double it.
 *
 * vec_sub() and vec_sl() return their result and do not modify their
 * operands, so each result must be assigned back to the source vector;
 * otherwise the adjustment is silently dropped before the stores below. */
if (rangered) {
    if (!sign) {
        src0 = vec_sub(src0, signed_bias);
        src1 = vec_sub(src1, signed_bias);
        src2 = vec_sub(src2, signed_bias);
        src3 = vec_sub(src3, signed_bias);
        src4 = vec_sub(src4, signed_bias);
        src5 = vec_sub(src5, signed_bias);
        src6 = vec_sub(src6, signed_bias);
        src7 = vec_sub(src7, signed_bias);
    }
    src0 = vec_sl(src0, rangered_shift);
    src1 = vec_sl(src1, rangered_shift);
    src2 = vec_sl(src2, rangered_shift);
    src3 = vec_sl(src3, rangered_shift);
    src4 = vec_sl(src4, rangered_shift);
    src5 = vec_sl(src5, rangered_shift);
    src6 = vec_sl(src6, rangered_shift);
    src7 = vec_sl(src7, rangered_shift);
}

vec_st(src0, 0, block);
vec_st(src1, 16, block);
vec_st(src2, 32, block);
Expand All @@ -224,6 +248,36 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
vec_st(src7,112, block);
}

/**
 * 8x8 inverse transform followed by ff_add_pixels_clamped_c().
 *
 * The coefficients in b are transformed in place by
 * vc1_inv_trans_8x8_altivec() with rangered = 0, so no coefficient
 * adjustment is requested and the sign argument is not consulted.
 * The transformed block is then passed to ff_add_pixels_clamped_c()
 * together with dest and stride.
 *
 * @param dest   destination pixels, forwarded to ff_add_pixels_clamped_c()
 * @param stride line size forwarded unchanged (presumably in bytes - confirm)
 * @param b      64-entry coefficient block; overwritten by the transform
 */
static void vc1_inv_trans_8x8_add_altivec(uint8_t *dest, int stride, DCTELEM *b)
{
vc1_inv_trans_8x8_altivec(b, 0, 0);
ff_add_pixels_clamped_c(b, dest, stride);
}

/**
 * 8x8 inverse transform followed by ff_put_signed_pixels_clamped_c().
 *
 * The coefficients in b are transformed in place by
 * vc1_inv_trans_8x8_altivec() with sign = 1 and rangered = 0; because
 * rangered is zero no coefficient adjustment is requested. The result is
 * then passed to ff_put_signed_pixels_clamped_c() with dest and stride.
 *
 * @param dest   destination pixels, forwarded to the put routine
 * @param stride line size forwarded unchanged (presumably in bytes - confirm)
 * @param b      64-entry coefficient block; overwritten by the transform
 */
static void vc1_inv_trans_8x8_put_signed_altivec(uint8_t *dest, int stride, DCTELEM *b)
{
vc1_inv_trans_8x8_altivec(b, 1, 0);
ff_put_signed_pixels_clamped_c(b, dest, stride);
}

/**
 * Range-reduced variant of the signed put: 8x8 inverse transform followed
 * by ff_put_signed_pixels_clamped_c().
 *
 * Calls vc1_inv_trans_8x8_altivec() with sign = 1 and rangered = 1, which
 * requests the rangered coefficient adjustment without the bias
 * subtraction (that step is only taken when sign is zero). The block is
 * then passed to ff_put_signed_pixels_clamped_c() with dest and stride.
 *
 * @param dest   destination pixels, forwarded to the put routine
 * @param stride line size forwarded unchanged (presumably in bytes - confirm)
 * @param b      64-entry coefficient block; overwritten by the transform
 */
static void vc1_inv_trans_8x8_put_signed_rangered_altivec(uint8_t *dest, int stride, DCTELEM *b)
{
vc1_inv_trans_8x8_altivec(b, 1, 1);
ff_put_signed_pixels_clamped_c(b, dest, stride);
}

/**
 * 8x8 inverse transform followed by ff_put_pixels_clamped_c().
 *
 * The coefficients in b are transformed in place by
 * vc1_inv_trans_8x8_altivec() with rangered = 0, so no coefficient
 * adjustment is requested and the sign argument is not consulted.
 * The result is then passed to ff_put_pixels_clamped_c() with dest and
 * stride.
 *
 * @param dest   destination pixels, forwarded to the put routine
 * @param stride line size forwarded unchanged (presumably in bytes - confirm)
 * @param b      64-entry coefficient block; overwritten by the transform
 */
static void vc1_inv_trans_8x8_put_altivec(uint8_t *dest, int stride, DCTELEM *b)
{
vc1_inv_trans_8x8_altivec(b, 0, 0);
ff_put_pixels_clamped_c(b, dest, stride);
}

/**
 * Range-reduced variant of the unsigned put: 8x8 inverse transform
 * followed by ff_put_pixels_clamped_c().
 *
 * Calls vc1_inv_trans_8x8_altivec() with sign = 0 and rangered = 1, which
 * requests both rangered adjustment steps: subtraction of the bias (64)
 * and the left shift by one. The block is then passed to
 * ff_put_pixels_clamped_c() with dest and stride.
 *
 * @param dest   destination pixels, forwarded to the put routine
 * @param stride line size forwarded unchanged (presumably in bytes - confirm)
 * @param b      64-entry coefficient block; overwritten by the transform
 */
static void vc1_inv_trans_8x8_put_rangered_altivec(uint8_t *dest, int stride, DCTELEM *b)
{
vc1_inv_trans_8x8_altivec(b, 0, 1);
ff_put_pixels_clamped_c(b, dest, stride);
}

/** Do inverse transform on 8x4 part of block
*/
static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, DCTELEM *block)
Expand Down Expand Up @@ -342,7 +396,11 @@ void ff_vc1dsp_init_altivec(VC1DSPContext* dsp)
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;

dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec;
dsp->vc1_inv_trans_8x8_add = vc1_inv_trans_8x8_add_altivec;
dsp->vc1_inv_trans_8x8_put_signed[0] = vc1_inv_trans_8x8_put_signed_altivec;
dsp->vc1_inv_trans_8x8_put_signed[1] = vc1_inv_trans_8x8_put_signed_rangered_altivec;
dsp->vc1_inv_trans_8x8_put[0] = vc1_inv_trans_8x8_put_altivec;
dsp->vc1_inv_trans_8x8_put[1] = vc1_inv_trans_8x8_put_rangered_altivec;
dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec;
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_altivec;
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_altivec;
Expand Down
28 changes: 27 additions & 1 deletion libavcodec/vc1.c
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,28 @@ static int vop_dquant_decoding(VC1Context *v)

static int decode_sequence_header_adv(VC1Context *v, GetBitContext *gb);

/**
 * Simple IDCT, rangered coefficient adjustment, then unsigned clamped put.
 *
 * Runs ff_simple_idct() in place on block, maps every one of the 64 values
 * x to (x - 64) * 2, and passes the block to ff_put_pixels_clamped_c().
 * Installed as vc1_inv_trans_8x8_put[1], i.e. the variant callers select
 * when v->rangeredfrm is set.
 *
 * @param dest      destination pixels, forwarded to the put routine
 * @param line_size line size forwarded unchanged (presumably in bytes - confirm)
 * @param block     64-entry coefficient block; modified in place
 */
static void simple_idct_put_rangered(uint8_t *dest, int line_size, DCTELEM *block)
{
    int i;

    ff_simple_idct(block);
    /* Multiply instead of "<< 1": the biased value can be negative, and
     * left-shifting a negative value is undefined behaviour in C. */
    for (i = 0; i < 64; i++)
        block[i] = (block[i] - 64) * 2;
    ff_put_pixels_clamped_c(block, dest, line_size);
}

/**
 * Simple IDCT followed by ff_put_signed_pixels_clamped_c().
 *
 * Runs ff_simple_idct() in place on block and passes the result, unchanged,
 * to ff_put_signed_pixels_clamped_c() together with dest and line_size.
 *
 * @param dest      destination pixels, forwarded to the put routine
 * @param line_size line size forwarded unchanged (presumably in bytes - confirm)
 * @param block     coefficient block; overwritten by the IDCT
 */
static void simple_idct_put_signed(uint8_t *dest, int line_size, DCTELEM *block)
{
ff_simple_idct(block);
ff_put_signed_pixels_clamped_c(block, dest, line_size);
}

/**
 * Simple IDCT, rangered coefficient doubling, then signed clamped put.
 *
 * Runs ff_simple_idct() in place on block, doubles every one of the 64
 * values, and passes the block to ff_put_signed_pixels_clamped_c().
 * Installed as vc1_inv_trans_8x8_put_signed[1], i.e. the variant callers
 * select when v->rangeredfrm is set.
 *
 * @param dest      destination pixels, forwarded to the put routine
 * @param line_size line size forwarded unchanged (presumably in bytes - confirm)
 * @param block     64-entry coefficient block; modified in place
 */
static void simple_idct_put_signed_rangered(uint8_t *dest, int line_size, DCTELEM *block)
{
    int i;

    ff_simple_idct(block);
    /* Multiply instead of "<<= 1": coefficients can be negative, and
     * left-shifting a negative value is undefined behaviour in C. */
    for (i = 0; i < 64; i++)
        block[i] *= 2;
    ff_put_signed_pixels_clamped_c(block, dest, line_size);
}

/**
* Decode Simple/Main Profiles sequence header
* @see Figure 7-8, p16-17
Expand Down Expand Up @@ -337,7 +359,11 @@ int vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitConte
v->res_fasttx = get_bits1(gb);
if (!v->res_fasttx)
{
v->vc1dsp.vc1_inv_trans_8x8 = ff_simple_idct;
v->vc1dsp.vc1_inv_trans_8x8_add = ff_simple_idct_add;
v->vc1dsp.vc1_inv_trans_8x8_put[0] = ff_simple_idct_put;
v->vc1dsp.vc1_inv_trans_8x8_put[1] = simple_idct_put_rangered;
v->vc1dsp.vc1_inv_trans_8x8_put_signed[0] = simple_idct_put_signed;
v->vc1dsp.vc1_inv_trans_8x8_put_signed[1] = simple_idct_put_signed_rangered;
v->vc1dsp.vc1_inv_trans_8x4 = ff_simple_idct84_add;
v->vc1dsp.vc1_inv_trans_4x8 = ff_simple_idct48_add;
v->vc1dsp.vc1_inv_trans_4x4 = ff_simple_idct44_add;
Expand Down
58 changes: 33 additions & 25 deletions libavcodec/vc1dec.c
Original file line number Diff line number Diff line change
Expand Up @@ -2009,8 +2009,7 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan
if(i==1)
v->vc1dsp.vc1_inv_trans_8x8_dc(dst, linesize, block);
else{
v->vc1dsp.vc1_inv_trans_8x8(block);
s->dsp.add_pixels_clamped(block, dst, linesize);
v->vc1dsp.vc1_inv_trans_8x8_add(dst, linesize, block);
}
if(apply_filter && cbp_top & 0xC)
v->vc1dsp.vc1_v_loop_filter8(dst, linesize, v->pq);
Expand Down Expand Up @@ -2117,7 +2116,7 @@ static int vc1_decode_p_mb(VC1Context *v)
{
MpegEncContext *s = &v->s;
GetBitContext *gb = &s->gb;
int i, j;
int i;
int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
int cbp; /* cbp decoding stuff */
int mqdiff, mquant; /* MB quantization */
Expand Down Expand Up @@ -2149,6 +2148,8 @@ static int vc1_decode_p_mb(VC1Context *v)
{
if (!skipped)
{
vc1_idct_func idct8x8_fn;

GET_MVDATA(dmv_x, dmv_y);

if (s->mb_intra) {
Expand Down Expand Up @@ -2183,6 +2184,7 @@ static int vc1_decode_p_mb(VC1Context *v)
VC1_TTMB_VLC_BITS, 2);
if(!s->mb_intra) vc1_mc_1mv(v, 0);
dst_idx = 0;
idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
for (i=0; i<6; i++)
{
s->dc_val[0][s->block_index[i]] = 0;
Expand All @@ -2200,9 +2202,9 @@ static int vc1_decode_p_mb(VC1Context *v)

vc1_decode_intra_block(v, s->block[i], i, val, mquant, (i&4)?v->codingset2:v->codingset);
if((i>3) && (s->flags & CODEC_FLAG_GRAY)) continue;
v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
if(v->rangeredfrm) for(j = 0; j < 64; j++) s->block[i][j] <<= 1;
s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
idct8x8_fn(s->dest[dst_idx] + off,
i & 4 ? s->uvlinesize : s->linesize,
s->block[i]);
if(v->pq >= 9 && v->overlap) {
if(v->c_avail)
v->vc1dsp.vc1_h_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
Expand Down Expand Up @@ -2267,6 +2269,7 @@ static int vc1_decode_p_mb(VC1Context *v)
{
int intra_count = 0, coded_inter = 0;
int is_intra[6], is_coded[6];
vc1_idct_func idct8x8_fn;
/* Get CBPCY */
cbp = get_vlc2(&v->s.gb, v->cbpcy_vlc->table, VC1_CBPCY_P_VLC_BITS, 2);
for (i=0; i<6; i++)
Expand Down Expand Up @@ -2316,6 +2319,7 @@ static int vc1_decode_p_mb(VC1Context *v)
}
if (!v->ttmbf && coded_inter)
ttmb = get_vlc2(gb, ff_vc1_ttmb_vlc[v->tt_index].table, VC1_TTMB_VLC_BITS, 2);
idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
for (i=0; i<6; i++)
{
dst_idx += i >> 2;
Expand All @@ -2331,9 +2335,9 @@ static int vc1_decode_p_mb(VC1Context *v)

vc1_decode_intra_block(v, s->block[i], i, is_coded[i], mquant, (i&4)?v->codingset2:v->codingset);
if((i>3) && (s->flags & CODEC_FLAG_GRAY)) continue;
v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
if(v->rangeredfrm) for(j = 0; j < 64; j++) s->block[i][j] <<= 1;
s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, (i&4)?s->uvlinesize:s->linesize);
idct8x8_fn(s->dest[dst_idx] + off,
(i&4)?s->uvlinesize:s->linesize,
s->block[i]);
if(v->pq >= 9 && v->overlap) {
if(v->c_avail)
v->vc1dsp.vc1_h_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
Expand Down Expand Up @@ -2409,7 +2413,7 @@ static void vc1_decode_b_mb(VC1Context *v)
{
MpegEncContext *s = &v->s;
GetBitContext *gb = &s->gb;
int i, j;
int i;
int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
int cbp = 0; /* cbp decoding stuff */
int mqdiff, mquant; /* MB quantization */
Expand All @@ -2422,6 +2426,7 @@ static void vc1_decode_b_mb(VC1Context *v)
int skipped, direct;
int dmv_x[2], dmv_y[2];
int bmvtype = BMV_TYPE_BACKWARD;
vc1_idct_func idct8x8_fn;

mquant = v->pq; /* Loosy initialization */
s->mb_intra = 0;
Expand Down Expand Up @@ -2519,6 +2524,7 @@ static void vc1_decode_b_mb(VC1Context *v)
}
}
dst_idx = 0;
idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
for (i=0; i<6; i++)
{
s->dc_val[0][s->block_index[i]] = 0;
Expand All @@ -2536,9 +2542,9 @@ static void vc1_decode_b_mb(VC1Context *v)

vc1_decode_intra_block(v, s->block[i], i, val, mquant, (i&4)?v->codingset2:v->codingset);
if((i>3) && (s->flags & CODEC_FLAG_GRAY)) continue;
v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
if(v->rangeredfrm) for(j = 0; j < 64; j++) s->block[i][j] <<= 1;
s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
idct8x8_fn(s->dest[dst_idx] + off,
i & 4 ? s->uvlinesize : s->linesize,
s->block[i]);
} else if(val) {
vc1_decode_p_block(v, s->block[i], i, mquant, ttmb, first_block, s->dest[dst_idx] + off, (i&4)?s->uvlinesize:s->linesize, (i&4) && (s->flags & CODEC_FLAG_GRAY), 0, 0, 0);
if(!v->ttmbf && ttmb < 8) ttmb = -1;
Expand All @@ -2551,11 +2557,12 @@ static void vc1_decode_b_mb(VC1Context *v)
*/
static void vc1_decode_i_blocks(VC1Context *v)
{
int k, j;
int k;
MpegEncContext *s = &v->s;
int cbp, val;
uint8_t *coded_val;
int mb_pos;
vc1_idct_func idct8x8_fn;

/* select codingmode used for VLC tables selection */
switch(v->y_ac_table_index){
Expand Down Expand Up @@ -2590,6 +2597,10 @@ static void vc1_decode_i_blocks(VC1Context *v)
s->mb_x = s->mb_y = 0;
s->mb_intra = 1;
s->first_slice_line = 1;
if(v->pq >= 9 && v->overlap) {
idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[!!v->rangeredfrm];
} else
idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put[!!v->rangeredfrm];
for(s->mb_y = 0; s->mb_y < s->mb_height; s->mb_y++) {
s->mb_x = 0;
ff_init_block_index(s);
Expand Down Expand Up @@ -2626,14 +2637,9 @@ static void vc1_decode_i_blocks(VC1Context *v)
vc1_decode_i_block(v, s->block[k], k, val, (k<4)? v->codingset : v->codingset2);

if (k > 3 && (s->flags & CODEC_FLAG_GRAY)) continue;
v->vc1dsp.vc1_inv_trans_8x8(s->block[k]);
if(v->pq >= 9 && v->overlap) {
if (v->rangeredfrm) for(j = 0; j < 64; j++) s->block[k][j] <<= 1;
s->dsp.put_signed_pixels_clamped(s->block[k], dst[k], k & 4 ? s->uvlinesize : s->linesize);
} else {
if (v->rangeredfrm) for(j = 0; j < 64; j++) s->block[k][j] = (s->block[k][j] - 64) << 1;
s->dsp.put_pixels_clamped(s->block[k], dst[k], k & 4 ? s->uvlinesize : s->linesize);
}
idct8x8_fn(dst[k],
k & 4 ? s->uvlinesize : s->linesize,
s->block[k]);
}

if(v->pq >= 9 && v->overlap) {
Expand Down Expand Up @@ -2691,6 +2697,7 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
int mqdiff;
int overlap;
GetBitContext *gb = &s->gb;
vc1_idct_func idct8x8_fn;

/* select codingmode used for VLC tables selection */
switch(v->y_ac_table_index){
Expand Down Expand Up @@ -2721,6 +2728,7 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
s->mb_x = s->mb_y = 0;
s->mb_intra = 1;
s->first_slice_line = 1;
idct8x8_fn = v->vc1dsp.vc1_inv_trans_8x8_put_signed[0];
for(s->mb_y = 0; s->mb_y < s->mb_height; s->mb_y++) {
s->mb_x = 0;
ff_init_block_index(s);
Expand Down Expand Up @@ -2777,9 +2785,9 @@ static void vc1_decode_i_blocks_adv(VC1Context *v)
vc1_decode_i_block_adv(v, s->block[k], k, val, (k<4)? v->codingset : v->codingset2, mquant);

if (k > 3 && (s->flags & CODEC_FLAG_GRAY)) continue;
v->vc1dsp.vc1_inv_trans_8x8(s->block[k]);
s->dsp.put_signed_pixels_clamped(s->block[k], dst[k],
k & 4 ? s->uvlinesize : s->linesize);
idct8x8_fn(dst[k],
k & 4 ? s->uvlinesize : s->linesize,
s->block[k]);
}

if(overlap) {
Expand Down
Loading

0 comments on commit f8bed30

Please sign in to comment.