/* * Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. 
*/

/*
 * FILENAME: mlib_v_ImageChannelInsert_34.c
 *
 * FUNCTIONS
 *      mlib_v_ImageChannelInsert_U8_34R_A8D1X8
 *      mlib_v_ImageChannelInsert_U8_34R_A8D2X8
 *      mlib_v_ImageChannelInsert_U8_34R_D1
 *      mlib_v_ImageChannelInsert_U8_34R
 *      mlib_v_ImageChannelInsert_S16_34R_A8D1X4
 *      mlib_v_ImageChannelInsert_S16_34R_A8D2X4
 *      mlib_v_ImageChannelInsert_S16_34R_D1
 *      mlib_v_ImageChannelInsert_S16_34R
 *      mlib_v_ImageChannelInsert_U8_34L_A8D1X8
 *      mlib_v_ImageChannelInsert_U8_34L_A8D2X8
 *      mlib_v_ImageChannelInsert_U8_34L_D1
 *      mlib_v_ImageChannelInsert_U8_34L
 *      mlib_v_ImageChannelInsert_S16_34L_A8D1X4
 *      mlib_v_ImageChannelInsert_S16_34L_A8D2X4
 *      mlib_v_ImageChannelInsert_S16_34L_D1
 *      mlib_v_ImageChannelInsert_S16_34L
 *
 * SYNOPSIS
 *
 * ARGUMENT
 *      src     pointer to source image data
 *      dst     pointer to destination image data
 *      slb     source image line stride in bytes
 *      dlb     destination image line stride in bytes
 *      dsize   image data size in pixels
 *      xsize   image width in pixels
 *      ysize   image height in lines
 *      cmask   channel mask
 *
 * DESCRIPTION
 *      Low-level functions that insert a 3-channel image into the
 *      right or left 3 channels of a 4-channel image.
 *
 *      BGR => ABGR   (34R), or  RGB => RGBA   (34L)
 *
 * NOTE
 *      These functions are separated from mlib_v_ImageChannelInsert.c
 *      for loop unrolling and structure clarity.
*/ #include #include "vis_proto.h" #include "mlib_image.h" /***************************************************************/ #define INSERT_U8_34R \ sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); \ sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); \ sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); \ sdd = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb)); \ sde = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdc)); \ sdf = vis_fpmerge(vis_read_hi(sdb), vis_read_lo(sdc)); \ sdg = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sde)); \ sdh = vis_fpmerge(vis_read_lo(sdd), vis_read_hi(sdf)); \ sdi = vis_fpmerge(vis_read_hi(sde), vis_read_lo(sdf)); \ sdj = vis_fpmerge(vis_read_hi(sdg), vis_read_hi(sdi)); \ sdk = vis_fpmerge(vis_read_lo(sdg), vis_read_lo(sdi)); \ sdl = vis_fpmerge(vis_read_hi(sdh), vis_read_hi(sdh)); \ sdm = vis_fpmerge(vis_read_lo(sdh), vis_read_lo(sdh)); \ dd0 = vis_fpmerge(vis_read_hi(sdl), vis_read_hi(sdj)); \ dd1 = vis_fpmerge(vis_read_lo(sdl), vis_read_lo(sdj)); \ dd2 = vis_fpmerge(vis_read_hi(sdm), vis_read_hi(sdk)); \ dd3 = vis_fpmerge(vis_read_lo(sdm), vis_read_lo(sdk)); /***************************************************************/ #define LOAD_INSERT_STORE_U8_34R_A8 \ sd0 = *sp++; /* b0g0r0b1g1r1b2g2 */ \ sd1 = *sp++; /* r2b3g3r3b4g4r4b5 */ \ sd2 = *sp++; /* g5r5b6g6r6b7g7r7 */ \ INSERT_U8_34R \ vis_pst_8(dd0, dp++, bmask); \ vis_pst_8(dd1, dp++, bmask); \ vis_pst_8(dd2, dp++, bmask); \ vis_pst_8(dd3, dp++, bmask); /***************************************************************/ #define LOAD_INSERT_U8_34R \ vis_alignaddr((void *)soff, 0); \ s0 = s3; \ s1 = sp[1]; \ s2 = sp[2]; \ s3 = sp[3]; \ sd0 = vis_faligndata(s0, s1); \ sd1 = vis_faligndata(s1, s2); \ sd2 = vis_faligndata(s2, s3); \ sp += 3; \ dd4 = dd3; \ INSERT_U8_34R /***************************************************************/ /* * Both source and destination image data are 1-d vectors and * 8-byte aligned. And dsize is multiple of 8. 
*/ void mlib_v_ImageChannelInsert_U8_34R_A8D1X8(mlib_u8 *src, mlib_u8 *dst, mlib_s32 dsize) { mlib_d64 *sp, *dp; mlib_d64 sd0, sd1, sd2; /* source data */ mlib_d64 dd0, dd1, dd2, dd3; /* dst data */ mlib_d64 sda, sdb, sdc, sdd; /* intermediate variables */ mlib_d64 sde, sdf, sdg, sdh; mlib_d64 sdi, sdj, sdk, sdl; mlib_d64 sdm; int bmask = 0x77; int i; sp = (mlib_d64 *)src; dp = (mlib_d64 *)dst; #pragma pipeloop(0) for (i = 0; i < dsize / 8; i++) { LOAD_INSERT_STORE_U8_34R_A8; } } /***************************************************************/ /* * Either source or destination image data are not 1-d vectors, but * they are 8-byte aligned. And slb and dlb are multiple of 8. * The xsize is multiple of 8. */ void mlib_v_ImageChannelInsert_U8_34R_A8D2X8(mlib_u8 *src, mlib_s32 slb, mlib_u8 *dst, mlib_s32 dlb, mlib_s32 xsize, mlib_s32 ysize) { mlib_d64 *sp, *dp; /* 8-byte aligned pointer for pixel */ mlib_d64 *sl, *dl; /* 8-byte aligned pointer for line */ mlib_d64 sd0, sd1, sd2; /* source data */ mlib_d64 dd0, dd1, dd2, dd3; /* dst data */ mlib_d64 sda, sdb, sdc, sdd; /* intermediate variables */ mlib_d64 sde, sdf, sdg, sdh; mlib_d64 sdi, sdj, sdk, sdl; mlib_d64 sdm; int bmask = 0x77; int i, j; /* indices for x, y */ sp = sl = (mlib_d64 *)src; dp = dl = (mlib_d64 *)dst; /* row loop */ for (j = 0; j < ysize; j++) { /* 8-byte column loop */ #pragma pipeloop(0) for (i = 0; i < xsize / 8; i++) { LOAD_INSERT_STORE_U8_34R_A8; } sp = sl = (mlib_d64 *)((mlib_u8 *)sl + slb); dp = dl = (mlib_d64 *)((mlib_u8 *)dl + dlb); } } /***************************************************************/ /* * either source or destination data are not 8-byte aligned. 
*/ void mlib_v_ImageChannelInsert_U8_34R_D1(mlib_u8 *src, mlib_u8 *dst, mlib_s32 dsize) { mlib_u8 *sa, *da; mlib_u8 *dend, *dend2; /* end points in dst */ mlib_d64 *dp; /* 8-byte aligned start points in dst */ mlib_d64 *sp; /* 8-byte aligned start point in src */ mlib_d64 s0, s1, s2, s3; /* 8-byte source raw data */ mlib_d64 sd0, sd1, sd2; /* 8-byte source data */ mlib_d64 dd0, dd1, dd2, dd3; /* dst data */ mlib_d64 dd4; /* the last datum of the last step */ mlib_d64 sda, sdb, sdc, sdd; /* intermediate variables */ mlib_d64 sde, sdf, sdg, sdh; mlib_d64 sdi, sdj, sdk, sdl; mlib_d64 sdm; int soff; /* offset of address in src */ int doff; /* offset of address in dst */ int emask; /* edge mask */ int bmask; /* channel mask */ int i, n; sa = src; da = dst; /* prepare the source address */ sp = (mlib_d64 *) ((mlib_addr) sa & (~7)); soff = ((mlib_addr) sa & 7); /* prepare the destination addresses */ dp = (mlib_d64 *)((mlib_addr) da & (~7)); dend = da + dsize * 4 - 1; dend2 = dend - 31; doff = ((mlib_addr) da & 7); /* set band mask for vis_pst_8 to store the bytes needed */ bmask = 0xff & (0x7777 >> doff) ; /* generate edge mask for the start point */ emask = vis_edge8(da, dend); /* load 24 bytes, convert to 32 bytes */ s3 = sp[0]; /* initial value */ LOAD_INSERT_U8_34R; if (doff == 0) { /* dst is 8-byte aligned */ if (dsize >= 8 ) { vis_pst_8(dd0, dp++, emask & bmask); vis_pst_8(dd1, dp++, bmask); vis_pst_8(dd2, dp++, bmask); vis_pst_8(dd3, dp++, bmask); } else { /* for very small size */ vis_pst_8(dd0, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(dd1, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(dd2, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(dd3, dp++, emask & bmask); } } } } /* no edge handling is needed in the loop */ if ((mlib_addr) dp <= (mlib_addr) dend2) { n = ((mlib_u8 *)dend2 - (mlib_u8 
*)dp) / 32 + 1; #pragma pipeloop(0) for (i = 0; i < n; i++) { LOAD_INSERT_U8_34R; vis_pst_8(dd0, dp++, bmask); vis_pst_8(dd1, dp++, bmask); vis_pst_8(dd2, dp++, bmask); vis_pst_8(dd3, dp++, bmask); } } if ((mlib_addr) dp <= (mlib_addr) dend) { LOAD_INSERT_U8_34R; emask = vis_edge8(dp, dend); vis_pst_8(dd0, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(dd1, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(dd2, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(dd3, dp++, emask & bmask); } } } } } else { /* (doff != 0) */ vis_alignaddr((void *)0, -doff); if (dsize >= 8 ) { vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask); vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask); vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask); vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask); } else { /* for very small size */ vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(vis_faligndata(dd3, dd3), dp++, emask & bmask); } } } } } /* no edge handling is needed in the loop */ if ((mlib_addr) dp <= (mlib_addr) dend2) { n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1; #pragma pipeloop(0) for (i = 0; i < n; i++) { LOAD_INSERT_U8_34R; vis_alignaddr((void *)0, -doff); vis_pst_8(vis_faligndata(dd4, dd0), dp++, bmask); vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask); vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask); vis_pst_8(vis_faligndata(dd2, dd3), dp++, 
bmask); } } if ((mlib_addr) dp <= (mlib_addr) dend) { LOAD_INSERT_U8_34R; vis_alignaddr((void *)0, -doff); emask = vis_edge8(dp, dend); vis_pst_8(vis_faligndata(dd4, dd0), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask); } } } } } } /***************************************************************/ void mlib_v_ImageChannelInsert_U8_34R(mlib_u8 *src, mlib_s32 slb, mlib_u8 *dst, mlib_s32 dlb, mlib_s32 xsize, mlib_s32 ysize) { mlib_u8 *sa, *da; mlib_u8 *sl, *dl; int j; sa = sl = src; da = dl = dst; #pragma pipeloop(0) for (j = 0; j < ysize; j++) { mlib_v_ImageChannelInsert_U8_34R_D1(sa, da, xsize); sa = sl += slb; da = dl += dlb; } } /***************************************************************/ #define INSERT_S16_34R \ vis_alignaddr((void *)0, 6); \ dd0 = vis_faligndata(sd0, sd0); /* b1b0g0r0 */ \ vis_alignaddr((void *)0, 4); \ dd1 = vis_faligndata(sd0, sd1); /* r0b1gbr1 */ \ vis_alignaddr((void *)0, 2); \ dd2 = vis_faligndata(sd1, sd2); /* r1b2g2r2 */ \ dd3 = sd2; /* r2b3g3r3 */ /***************************************************************/ #define LOAD_INSERT_STORE_S16_34R_A8 \ sd0 = *sp++; /* b0g0r0b1 */ \ sd1 = *sp++; /* g1r1b2g2 */ \ sd2 = *sp++; /* r2b3g3r3 */ \ INSERT_S16_34R \ vis_pst_16(dd0, dp++, bmask); \ vis_pst_16(dd1, dp++, bmask); \ vis_pst_16(dd2, dp++, bmask); \ vis_pst_16(dd3, dp++, bmask); /***************************************************************/ #define LOAD_INSERT_S16_34R \ vis_alignaddr((void *)soff, 0); \ s0 = s3; \ s1 = sp[1]; \ s2 = sp[2]; \ s3 = sp[3]; \ sd0 = vis_faligndata(s0, s1); \ sd1 = vis_faligndata(s1, s2); \ sd2 = vis_faligndata(s2, s3); \ sp += 3; \ dd4 = dd3; \ 
INSERT_S16_34R /***************************************************************/ /* * both source and destination image data are 1-d vectors and * 8-byte aligned. dsize is multiple of 4. */ void mlib_v_ImageChannelInsert_S16_34R_A8D1X4(mlib_s16 *src, mlib_s16 *dst, mlib_s32 dsize) { mlib_d64 *sp, *dp; /* 8-byte aligned pointer for pixel */ mlib_d64 sd0, sd1, sd2; /* source data */ mlib_d64 dd0, dd1, dd2, dd3; /* dst data */ int bmask = 0x07; /* channel mask */ int i; sp = (mlib_d64 *)src; dp = (mlib_d64 *)dst; /* set GSR.offset for vis_faligndata() */ /* vis_alignaddr((void *)0, 2); */ /* only for _old */ #pragma pipeloop(0) for (i = 0; i < dsize / 4; i++) { LOAD_INSERT_STORE_S16_34R_A8; } } /***************************************************************/ /* * either source or destination image data are not 1-d vectors, but * they are 8-byte aligned. xsize is multiple of 4. */ void mlib_v_ImageChannelInsert_S16_34R_A8D2X4(mlib_s16 *src, mlib_s32 slb, mlib_s16 *dst, mlib_s32 dlb, mlib_s32 xsize, mlib_s32 ysize) { mlib_d64 *sp, *dp; /* 8-byte aligned pointer for pixel */ mlib_d64 *sl, *dl; /* 8-byte aligned pointer for line */ mlib_d64 sd0, sd1, sd2; /* source data */ mlib_d64 dd0, dd1, dd2, dd3; /* dst data */ int bmask = 0x07; /* channel mask */ int i, j; /* indices for x, y */ sp = sl = (mlib_d64 *)src; dp = dl = (mlib_d64 *)dst; /* row loop */ for (j = 0; j < ysize; j++) { /* 4-pixel column loop */ #pragma pipeloop(0) for (i = 0; i < xsize / 4; i++) { LOAD_INSERT_STORE_S16_34R_A8; } sp = sl = (mlib_d64 *)((mlib_u8 *)sl + slb); dp = dl = (mlib_d64 *)((mlib_u8 *)dl + dlb); } } /***************************************************************/ /* * either source or destination data are not 8-byte aligned. 
*/ void mlib_v_ImageChannelInsert_S16_34R_D1(mlib_s16 *src, mlib_s16 *dst, mlib_s32 dsize) { mlib_s16 *sa, *da; /* pointer for pixel */ mlib_s16 *dend, *dend2; /* end points in dst */ mlib_d64 *dp; /* 8-byte aligned start points in dst */ mlib_d64 *sp; /* 8-byte aligned start point in src */ mlib_d64 s0, s1, s2, s3; /* 8-byte source raw data */ mlib_d64 sd0, sd1, sd2; /* 8-byte source data */ mlib_d64 dd0, dd1, dd2, dd3; /* dst data */ mlib_d64 dd4; /* the last datum of the last step */ int soff; /* offset of address in src */ int doff; /* offset of address in dst */ int emask; /* edge mask */ int bmask; /* channel mask */ int i, n; sa = src; da = dst; /* prepare the source address */ sp = (mlib_d64 *) ((mlib_addr) sa & (~7)); soff = ((mlib_addr) sa & 7); /* prepare the destination addresses */ dp = (mlib_d64 *)((mlib_addr) da & (~7)); dend = da + dsize * 4 - 1; dend2 = dend - 15; doff = ((mlib_addr) da & 7); /* set channel mask for vis_pst_16 to store the words needed */ bmask = 0xff & (0x77 >> (doff / 2)); /* generate edge mask for the start point */ emask = vis_edge16(da, dend); /* load 24 byte, convert, store 32 bytes */ s3 = sp[0]; /* initial value */ LOAD_INSERT_S16_34R; if (doff == 0) { /* dst is 8-byte aligned */ if (dsize >= 4 ) { vis_pst_16(dd0, dp++, emask & bmask); vis_pst_16(dd1, dp++, bmask); vis_pst_16(dd2, dp++, bmask); vis_pst_16(dd3, dp++, bmask); } else { /* for very small size */ vis_pst_16(dd0, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(dd1, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(dd2, dp++, emask & bmask); } } } /* no edge handling is needed in the loop */ if ((mlib_addr) dp <= (mlib_addr) dend2) { n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1; #pragma pipeloop(0) for (i = 0; i < n; i++) { LOAD_INSERT_S16_34R; vis_pst_16(dd0, dp++, bmask); vis_pst_16(dd1, dp++, bmask); vis_pst_16(dd2, dp++, bmask); vis_pst_16(dd3, 
dp++, bmask); } } if ((mlib_addr) dp <= (mlib_addr) dend) { LOAD_INSERT_S16_34R; emask = vis_edge16(dp, dend); vis_pst_16(dd0, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(dd1, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(dd2, dp++, emask & bmask); } } } } else { /* (doff != 0) */ vis_alignaddr((void *)0, -doff); if (dsize >= 4 ) { vis_pst_16(vis_faligndata(dd0, dd0), dp++, emask & bmask); vis_pst_16(vis_faligndata(dd0, dd1), dp++, bmask); vis_pst_16(vis_faligndata(dd1, dd2), dp++, bmask); vis_pst_16(vis_faligndata(dd2, dd3), dp++, bmask); } else { /* for very small size */ vis_pst_16(vis_faligndata(dd0, dd0), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(vis_faligndata(dd1, dd2), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(vis_faligndata(dd2, dd3), dp++, emask & bmask); } } } } /* no edge handling is needed in the loop */ if ((mlib_addr) dp <= (mlib_addr) dend2) { n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1; #pragma pipeloop(0) for (i = 0; i < n; i++) { LOAD_INSERT_S16_34R; vis_alignaddr((void *)0, -doff); vis_pst_16(vis_faligndata(dd4, dd0), dp++, bmask); vis_pst_16(vis_faligndata(dd0, dd1), dp++, bmask); vis_pst_16(vis_faligndata(dd1, dd2), dp++, bmask); vis_pst_16(vis_faligndata(dd2, dd3), dp++, bmask); } } if ((mlib_addr) dp <= (mlib_addr) dend) { LOAD_INSERT_S16_34R; vis_alignaddr((void *)0, -doff); emask = vis_edge16(dp, dend); vis_pst_16(vis_faligndata(dd4, dd0), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); 
vis_pst_16(vis_faligndata(dd1, dd2), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(vis_faligndata(dd2, dd3), dp++, emask & bmask); } } } } } } /***************************************************************/ void mlib_v_ImageChannelInsert_S16_34R(mlib_s16 *src, mlib_s32 slb, mlib_s16 *dst, mlib_s32 dlb, mlib_s32 xsize, mlib_s32 ysize) { mlib_s16 *sa, *da; mlib_s16 *sl, *dl; int j; sa = sl = src; da = dl = dst; #pragma pipeloop(0) for (j = 0; j < ysize; j++) { mlib_v_ImageChannelInsert_S16_34R_D1(sa, da, xsize); sa = sl = (mlib_s16 *)((mlib_u8 *)sl + slb); da = dl = (mlib_s16 *)((mlib_u8 *)dl + dlb); } } /***************************************************************/ #define INSERT_U8_34L \ sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); \ sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); \ sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); \ sdd = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb)); \ sde = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdc)); \ sdf = vis_fpmerge(vis_read_hi(sdb), vis_read_lo(sdc)); \ sdg = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sde)); \ sdh = vis_fpmerge(vis_read_lo(sdd), vis_read_hi(sdf)); \ sdi = vis_fpmerge(vis_read_hi(sde), vis_read_lo(sdf)); \ sdj = vis_fpmerge(vis_read_hi(sdg), vis_read_hi(sdi)); \ sdk = vis_fpmerge(vis_read_lo(sdg), vis_read_lo(sdi)); \ sdl = vis_fpmerge(vis_read_hi(sdh), vis_read_hi(sdh)); \ sdm = vis_fpmerge(vis_read_lo(sdh), vis_read_lo(sdh)); \ dd0 = vis_fpmerge(vis_read_hi(sdj), vis_read_hi(sdl)); \ dd1 = vis_fpmerge(vis_read_lo(sdj), vis_read_lo(sdl)); \ dd2 = vis_fpmerge(vis_read_hi(sdk), vis_read_hi(sdm)); \ dd3 = vis_fpmerge(vis_read_lo(sdk), vis_read_lo(sdm)); /***************************************************************/ #define LOAD_INSERT_STORE_U8_34L_A8 \ sd0 = *sp++; /* b0g0r0b1g1r1b2g2 */ \ sd1 = *sp++; /* r2b3g3r3b4g4r4b5 */ \ sd2 = *sp++; /* g5r5b6g6r6b7g7r7 */ \ INSERT_U8_34L \ vis_pst_8(dd0, dp++, bmask); \ 
vis_pst_8(dd1, dp++, bmask); \ vis_pst_8(dd2, dp++, bmask); \ vis_pst_8(dd3, dp++, bmask); /***************************************************************/ #define LOAD_INSERT_U8_34L \ vis_alignaddr((void *)soff, 0); \ s0 = s3; \ s1 = sp[1]; \ s2 = sp[2]; \ s3 = sp[3]; \ sd0 = vis_faligndata(s0, s1); \ sd1 = vis_faligndata(s1, s2); \ sd2 = vis_faligndata(s2, s3); \ sp += 3; \ dd4 = dd3; \ INSERT_U8_34L /***************************************************************/ /* * Both source and destination image data are 1-d vectors and * 8-byte aligned. And dsize is multiple of 8. */ void mlib_v_ImageChannelInsert_U8_34L_A8D1X8(mlib_u8 *src, mlib_u8 *dst, mlib_s32 dsize) { mlib_d64 *sp, *dp; mlib_d64 sd0, sd1, sd2; /* source data */ mlib_d64 dd0, dd1, dd2, dd3; /* dst data */ mlib_d64 sda, sdb, sdc, sdd; /* intermediate variables */ mlib_d64 sde, sdf, sdg, sdh; mlib_d64 sdi, sdj, sdk, sdl; mlib_d64 sdm; int bmask = 0xee; int i; sp = (mlib_d64 *)src; dp = (mlib_d64 *)dst; #pragma pipeloop(0) for (i = 0; i < dsize / 8; i++) { LOAD_INSERT_STORE_U8_34L_A8; } } /***************************************************************/ /* * Either source or destination image data are not 1-d vectors, but * they are 8-byte aligned. And slb and dlb are multiple of 8. * The xsize is multiple of 8. 
*/ void mlib_v_ImageChannelInsert_U8_34L_A8D2X8(mlib_u8 *src, mlib_s32 slb, mlib_u8 *dst, mlib_s32 dlb, mlib_s32 xsize, mlib_s32 ysize) { mlib_d64 *sp, *dp; /* 8-byte aligned pointer for pixel */ mlib_d64 *sl, *dl; /* 8-byte aligned pointer for line */ mlib_d64 sd0, sd1, sd2; /* source data */ mlib_d64 dd0, dd1, dd2, dd3; /* dst data */ mlib_d64 sda, sdb, sdc, sdd; /* intermediate variables */ mlib_d64 sde, sdf, sdg, sdh; mlib_d64 sdi, sdj, sdk, sdl; mlib_d64 sdm; int bmask = 0xee; int i, j; /* indices for x, y */ sp = sl = (mlib_d64 *)src; dp = dl = (mlib_d64 *)dst; /* row loop */ for (j = 0; j < ysize; j++) { /* 8-byte column loop */ #pragma pipeloop(0) for (i = 0; i < xsize / 8; i++) { LOAD_INSERT_STORE_U8_34L_A8; } sp = sl = (mlib_d64 *)((mlib_u8 *)sl + slb); dp = dl = (mlib_d64 *)((mlib_u8 *)dl + dlb); } } /***************************************************************/ /* * either source or destination data are not 8-byte aligned. */ void mlib_v_ImageChannelInsert_U8_34L_D1(mlib_u8 *src, mlib_u8 *dst, mlib_s32 dsize) { mlib_u8 *sa, *da; mlib_u8 *dend, *dend2; /* end points in dst */ mlib_d64 *dp; /* 8-byte aligned start points in dst */ mlib_d64 *sp; /* 8-byte aligned start point in src */ mlib_d64 s0, s1, s2, s3; /* 8-byte source raw data */ mlib_d64 sd0, sd1, sd2; /* 8-byte source data */ mlib_d64 dd0, dd1, dd2, dd3; /* dst data */ mlib_d64 dd4; /* the last datum of the last step */ mlib_d64 sda, sdb, sdc, sdd; /* intermediate variables */ mlib_d64 sde, sdf, sdg, sdh; mlib_d64 sdi, sdj, sdk, sdl; mlib_d64 sdm; int soff; /* offset of address in src */ int doff; /* offset of address in dst */ int emask; /* edge mask */ int bmask; /* channel mask */ int i, n; sa = src; da = dst; /* prepare the source address */ sp = (mlib_d64 *) ((mlib_addr) sa & (~7)); soff = ((mlib_addr) sa & 7); /* prepare the destination addresses */ dp = (mlib_d64 *)((mlib_addr) da & (~7)); dend = da + dsize * 4 - 1; dend2 = dend - 31; doff = ((mlib_addr) da & 7); /* set band mask for 
vis_pst_8 to store the bytes needed */ bmask = 0xff & (0xeeee >> doff) ; /* generate edge mask for the start point */ emask = vis_edge8(da, dend); /* load 24 bytes, convert to 32 bytes */ s3 = sp[0]; /* initial value */ LOAD_INSERT_U8_34L; if (doff == 0) { /* dst is 8-byte aligned */ if (dsize >= 8 ) { vis_pst_8(dd0, dp++, emask & bmask); vis_pst_8(dd1, dp++, bmask); vis_pst_8(dd2, dp++, bmask); vis_pst_8(dd3, dp++, bmask); } else { /* for very small size */ vis_pst_8(dd0, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(dd1, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(dd2, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(dd3, dp++, emask & bmask); } } } } /* no edge handling is needed in the loop */ if ((mlib_addr) dp <= (mlib_addr) dend2) { n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1; #pragma pipeloop(0) for (i = 0; i < n; i++) { LOAD_INSERT_U8_34L; vis_pst_8(dd0, dp++, bmask); vis_pst_8(dd1, dp++, bmask); vis_pst_8(dd2, dp++, bmask); vis_pst_8(dd3, dp++, bmask); } } if ((mlib_addr) dp <= (mlib_addr) dend) { LOAD_INSERT_U8_34L; emask = vis_edge8(dp, dend); vis_pst_8(dd0, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(dd1, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(dd2, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(dd3, dp++, emask & bmask); } } } } } else { /* (doff != 0) */ vis_alignaddr((void *)0, -doff); if (dsize >= 8 ) { vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask); vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask); vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask); vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask); } else { /* for very small size */ vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask); if 
((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(vis_faligndata(dd3, dd3), dp++, emask & bmask); } } } } } /* no edge handling is needed in the loop */ if ((mlib_addr) dp <= (mlib_addr) dend2) { n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1; #pragma pipeloop(0) for (i = 0; i < n; i++) { LOAD_INSERT_U8_34L; vis_alignaddr((void *)0, -doff); vis_pst_8(vis_faligndata(dd4, dd0), dp++, bmask); vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask); vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask); vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask); } } if ((mlib_addr) dp <= (mlib_addr) dend) { LOAD_INSERT_U8_34L; vis_alignaddr((void *)0, -doff); emask = vis_edge8(dp, dend); vis_pst_8(vis_faligndata(dd4, dd0), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge8(dp, dend); vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask); } } } } } } /***************************************************************/ void mlib_v_ImageChannelInsert_U8_34L(mlib_u8 *src, mlib_s32 slb, mlib_u8 *dst, mlib_s32 dlb, mlib_s32 xsize, mlib_s32 ysize) { mlib_u8 *sa, *da; mlib_u8 *sl, *dl; int j; sa = sl = src; da = dl = dst; #pragma pipeloop(0) for (j = 0; j < ysize; j++) { mlib_v_ImageChannelInsert_U8_34L_D1(sa, da, xsize); sa = sl += slb; da = dl += dlb; } } 
/***************************************************************/ #define INSERT_S16_34L \ dd0 = sd0; /* b0g0r0b1 */ \ vis_alignaddr((void *)0, 6); \ dd1 = vis_faligndata(sd0, sd1); /* b1gbr1b2 */ \ vis_alignaddr((void *)0, 4); \ dd2 = vis_faligndata(sd1, sd2); /* b2g2r2b3 */ \ vis_alignaddr((void *)0, 2); \ dd3 = vis_faligndata(sd2, sd2); /* b3g3r3r2 */ /***************************************************************/ #define LOAD_INSERT_STORE_S16_34L_A8 \ sd0 = *sp++; /* b0g0r0b1 */ \ sd1 = *sp++; /* g1r1b2g2 */ \ sd2 = *sp++; /* r2b3g3r3 */ \ INSERT_S16_34L \ vis_pst_16(dd0, dp++, bmask); \ vis_pst_16(dd1, dp++, bmask); \ vis_pst_16(dd2, dp++, bmask); \ vis_pst_16(dd3, dp++, bmask); /***************************************************************/ #define LOAD_INSERT_S16_34L \ vis_alignaddr((void *)soff, 0); \ s0 = s3; \ s1 = sp[1]; \ s2 = sp[2]; \ s3 = sp[3]; \ sd0 = vis_faligndata(s0, s1); \ sd1 = vis_faligndata(s1, s2); \ sd2 = vis_faligndata(s2, s3); \ sp += 3; \ dd4 = dd3; \ INSERT_S16_34L /***************************************************************/ /* * both source and destination image data are 1-d vectors and * 8-byte aligned. dsize is multiple of 4. */ void mlib_v_ImageChannelInsert_S16_34L_A8D1X4(mlib_s16 *src, mlib_s16 *dst, mlib_s32 dsize) { mlib_d64 *sp, *dp; /* 8-byte aligned pointer for pixel */ mlib_d64 sd0, sd1, sd2; /* source data */ mlib_d64 dd0, dd1, dd2, dd3; /* dst data */ int bmask = 0x0e; /* channel mask */ int i; sp = (mlib_d64 *)src; dp = (mlib_d64 *)dst; #pragma pipeloop(0) for (i = 0; i < dsize / 4; i++) { LOAD_INSERT_STORE_S16_34L_A8; } } /***************************************************************/ /* * either source or destination image data are not 1-d vectors, but * they are 8-byte aligned. xsize is multiple of 4. 
*/ void mlib_v_ImageChannelInsert_S16_34L_A8D2X4(mlib_s16 *src, mlib_s32 slb, mlib_s16 *dst, mlib_s32 dlb, mlib_s32 xsize, mlib_s32 ysize) { mlib_d64 *sp, *dp; /* 8-byte aligned pointer for pixel */ mlib_d64 *sl, *dl; /* 8-byte aligned pointer for line */ mlib_d64 sd0, sd1, sd2; /* source data */ mlib_d64 dd0, dd1, dd2, dd3; /* dst data */ int bmask = 0x0e; /* channel mask */ int i, j; /* indices for x, y */ sp = sl = (mlib_d64 *)src; dp = dl = (mlib_d64 *)dst; /* row loop */ for (j = 0; j < ysize; j++) { /* 4-pixel column loop */ #pragma pipeloop(0) for (i = 0; i < xsize / 4; i++) { LOAD_INSERT_STORE_S16_34L_A8; } sp = sl = (mlib_d64 *)((mlib_u8 *)sl + slb); dp = dl = (mlib_d64 *)((mlib_u8 *)dl + dlb); } } /***************************************************************/ /* * either source or destination data are not 8-byte aligned. */ void mlib_v_ImageChannelInsert_S16_34L_D1(mlib_s16 *src, mlib_s16 *dst, mlib_s32 dsize) { mlib_s16 *sa, *da; /* pointer for pixel */ mlib_s16 *dend, *dend2; /* end points in dst */ mlib_d64 *dp; /* 8-byte aligned start points in dst */ mlib_d64 *sp; /* 8-byte aligned start point in src */ mlib_d64 s0, s1, s2, s3; /* 8-byte source raw data */ mlib_d64 sd0, sd1, sd2; /* 8-byte source data */ mlib_d64 dd0, dd1, dd2, dd3; /* dst data */ mlib_d64 dd4; /* the last datum of the last step */ int soff; /* offset of address in src */ int doff; /* offset of address in dst */ int emask; /* edge mask */ int bmask; /* channel mask */ int i, n; sa = src; da = dst; /* prepare the source address */ sp = (mlib_d64 *) ((mlib_addr) sa & (~7)); soff = ((mlib_addr) sa & 7); /* prepare the destination addresses */ dp = (mlib_d64 *)((mlib_addr) da & (~7)); dend = da + dsize * 4 - 1; dend2 = dend - 15; doff = ((mlib_addr) da & 7); /* set channel mask for vis_pst_16 to store the words needed */ bmask = 0xff & (0xee >> (doff / 2)); /* generate edge mask for the start point */ emask = vis_edge16(da, dend); /* load 24 byte, convert, store 32 bytes */ s3 = 
sp[0]; /* initial value */ LOAD_INSERT_S16_34L; if (doff == 0) { /* dst is 8-byte aligned */ if (dsize >= 4 ) { vis_pst_16(dd0, dp++, emask & bmask); vis_pst_16(dd1, dp++, bmask); vis_pst_16(dd2, dp++, bmask); vis_pst_16(dd3, dp++, bmask); } else { /* for very small size */ vis_pst_16(dd0, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(dd1, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(dd2, dp++, emask & bmask); } } } /* no edge handling is needed in the loop */ if ((mlib_addr) dp <= (mlib_addr) dend2) { n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1; #pragma pipeloop(0) for (i = 0; i < n; i++) { LOAD_INSERT_S16_34L; vis_pst_16(dd0, dp++, bmask); vis_pst_16(dd1, dp++, bmask); vis_pst_16(dd2, dp++, bmask); vis_pst_16(dd3, dp++, bmask); } } if ((mlib_addr) dp <= (mlib_addr) dend) { LOAD_INSERT_S16_34L; emask = vis_edge16(dp, dend); vis_pst_16(dd0, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(dd1, dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(dd2, dp++, emask & bmask); } } } } else { /* (doff != 0) */ vis_alignaddr((void *)0, -doff); if (dsize >= 4 ) { vis_pst_16(vis_faligndata(dd0, dd0), dp++, emask & bmask); vis_pst_16(vis_faligndata(dd0, dd1), dp++, bmask); vis_pst_16(vis_faligndata(dd1, dd2), dp++, bmask); vis_pst_16(vis_faligndata(dd2, dd3), dp++, bmask); } else { /* for very small size */ vis_pst_16(vis_faligndata(dd0, dd0), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(vis_faligndata(dd1, dd2), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(vis_faligndata(dd2, dd3), dp++, emask & 
bmask); } } } } /* no edge handling is needed in the loop */ if ((mlib_addr) dp <= (mlib_addr) dend2) { n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1; #pragma pipeloop(0) for (i = 0; i < n; i++) { LOAD_INSERT_S16_34L; vis_alignaddr((void *)0, -doff); vis_pst_16(vis_faligndata(dd4, dd0), dp++, bmask); vis_pst_16(vis_faligndata(dd0, dd1), dp++, bmask); vis_pst_16(vis_faligndata(dd1, dd2), dp++, bmask); vis_pst_16(vis_faligndata(dd2, dd3), dp++, bmask); } } if ((mlib_addr) dp <= (mlib_addr) dend) { LOAD_INSERT_S16_34L; vis_alignaddr((void *)0, -doff); emask = vis_edge16(dp, dend); vis_pst_16(vis_faligndata(dd4, dd0), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(vis_faligndata(dd1, dd2), dp++, emask & bmask); if ((mlib_addr) dp <= (mlib_addr) dend) { emask = vis_edge16(dp, dend); vis_pst_16(vis_faligndata(dd2, dd3), dp++, emask & bmask); } } } } } } /***************************************************************/ void mlib_v_ImageChannelInsert_S16_34L(mlib_s16 *src, mlib_s32 slb, mlib_s16 *dst, mlib_s32 dlb, mlib_s32 xsize, mlib_s32 ysize) { mlib_s16 *sa, *da; mlib_s16 *sl, *dl; int j; sa = sl = src; da = dl = dst; #pragma pipeloop(0) for (j = 0; j < ysize; j++) { mlib_v_ImageChannelInsert_S16_34L_D1(sa, da, xsize); sa = sl = (mlib_s16 *)((mlib_u8 *)sl + slb); da = dl = (mlib_s16 *)((mlib_u8 *)dl + dlb); } } /***************************************************************/