M4RI  1.0.1
mzd.h
Go to the documentation of this file.
00001 
00010 #ifndef M4RI_MZD
00011 #define M4RI_MZD
00012 
00013 /*******************************************************************
00014 *
00015 *                M4RI: Linear Algebra over GF(2)
00016 *
00017 *    Copyright (C) 2007, 2008 Gregory Bard <bard@fordham.edu>
00018 *    Copyright (C) 2008-2010 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
00019 *    Copyright (C) 2011 Carlo Wood <carlo@alinoe.com>
00020 *
00021 *  Distributed under the terms of the GNU General Public License (GPL)
00022 *  version 2 or higher.
00023 *
00024 *    This code is distributed in the hope that it will be useful,
00025 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
00026 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00027 *    General Public License for more details.
00028 *
00029 *  The full text of the GPL is available at:
00030 *
00031 *                  http://www.gnu.org/licenses/
00032 *
00033 ********************************************************************/
00034 
00035 #include "m4ri_config.h"
00036 
00037 #include <math.h>
00038 #include <assert.h>
00039 #include <stdio.h>
00040 
00041 #if __M4RI_HAVE_SSE2
00042 #include <emmintrin.h>
00043 #endif
00044 
00045 #include "misc.h"
00046 #include "debug_dump.h"
00047 
00048 #if __M4RI_HAVE_SSE2
00049 
00056 #define __M4RI_SSE2_CUTOFF 10
00057 #endif
00058 
00065 #define __M4RI_MAX_MZD_BLOCKSIZE (((size_t)1) << 27)
00066 
00075 #define __M4RI_MUL_BLOCKSIZE MIN(((int)sqrt((double)(4 * __M4RI_CPU_L2_CACHE))) / 2, 2048)
00076 
00077 typedef struct {
00078   size_t size;
00079   word* begin;
00080   word* end;
00081 } mzd_block_t;
00082 
00089 typedef struct mzd_t {
00094   rci_t nrows;
00095 
00100   rci_t ncols;
00101 
00108   wi_t width; 
00109 
00117   wi_t rowstride;
00118 
00126   wi_t offset_vector;
00127 
00132   wi_t row_offset;
00133 
00138   uint16_t offset;
00139 
00153   uint8_t flags;
00154 
00160   uint8_t blockrows_log;
00161 
00162 #if 0   // Commented out in order to keep the size of mzd_t 64 bytes (one cache line). This could be added back if rows was ever removed.
00163 
00168   int blockrows_mask;
00169 #endif
00170 
00175   word high_bitmask;
00176 
00181   word low_bitmask;
00182 
00188   mzd_block_t *blocks;
00189 
00195   word **rows;
00196 
00197 } mzd_t;
00198 
00202 static wi_t const mzd_paddingwidth = 3;
00203 
00204 static uint8_t const mzd_flag_nonzero_offset = 0x1;
00205 static uint8_t const mzd_flag_nonzero_excess = 0x2;
00206 static uint8_t const mzd_flag_windowed_zerooffset = 0x4;
00207 static uint8_t const mzd_flag_windowed_zeroexcess = 0x8;
00208 static uint8_t const mzd_flag_windowed_ownsblocks = 0x10;
00209 static uint8_t const mzd_flag_multiple_blocks = 0x20;
00210 
00218 static inline int mzd_is_windowed(mzd_t const *M) {
00219   return M->flags & (mzd_flag_nonzero_offset | mzd_flag_windowed_zerooffset);
00220 }
00221 
00229 static inline int mzd_owns_blocks(mzd_t const *M) {
00230   return M->blocks && (!mzd_is_windowed(M) || ((M->flags & mzd_flag_windowed_ownsblocks)));
00231 }
00232 
00241 static inline word* mzd_first_row(mzd_t const *M) {
00242   word* result = M->blocks[0].begin + M->offset_vector;
00243   assert(M->nrows == 0 || result == M->rows[0]);
00244   return result;
00245 }
00246 
00257 static inline word* mzd_first_row_next_block(mzd_t const* M, int n) {
00258   assert(n > 0);
00259   return M->blocks[n].begin + M->offset_vector - M->row_offset * M->rowstride;
00260 }
00261 
00271 static inline int mzd_row_to_block(mzd_t const* M, rci_t row) {
00272   return (M->row_offset + row) >> M->blockrows_log;
00273 }
00274 
00288 static inline wi_t mzd_rows_in_block(mzd_t const* M, int n) {
00289   if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) {
00290     if (__M4RI_UNLIKELY(n == 0)) {
00291       return (1 << M->blockrows_log) - M->row_offset;
00292     } else {
00293       int const last_block = mzd_row_to_block(M, M->nrows - 1); 
00294       if (n < last_block)
00295         return (1 << M->blockrows_log);
00296       return M->nrows + M->row_offset - (n << M->blockrows_log);
00297     }
00298   }
00299   return n ? 0 : M->nrows;
00300 }
00301 
00311 static inline word* mzd_row(mzd_t const* M, rci_t row) {
00312   wi_t big_vector = M->offset_vector + row * M->rowstride;
00313   word* result = M->blocks[0].begin + big_vector;
00314   if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) {
00315     int const n = (M->row_offset + row) >> M->blockrows_log;
00316     result = M->blocks[n].begin + big_vector - n * (M->blocks[0].size / sizeof(word));
00317   }
00318   assert(result == M->rows[row]);
00319   return result;
00320 }
00321 
00332 mzd_t *mzd_init(rci_t const r, rci_t const c);
00333 
00340 void mzd_free(mzd_t *A);
00341 
00342 
00364 mzd_t *mzd_init_window(mzd_t *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc);
00365 
00372 static inline mzd_t const *mzd_init_window_const(mzd_t const *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc)
00373 {
00374   return mzd_init_window((mzd_t*)M, lowr, lowc, highr, highc);
00375 }
00376 
00383 #define mzd_free_window mzd_free
00384 
00394 static inline void _mzd_row_swap(mzd_t *M, rci_t const rowa, rci_t const rowb, wi_t const startblock) {
00395   if ((rowa == rowb) || (startblock >= M->width))
00396     return;
00397 
00398   /* This is the case since we're only called from _mzd_ple_mmpf,
00399    * which makes the same assumption. Therefore we don't need
00400    * to take a mask_begin into account. */
00401   assert(M->offset == 0);
00402 
00403   wi_t width = M->width - startblock - 1;
00404   word *a = M->rows[rowa] + startblock;
00405   word *b = M->rows[rowb] + startblock;
00406   word tmp; 
00407   word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix);
00408 
00409   if (width != 0) {
00410     for(wi_t i = 0; i < width; ++i) {
00411       tmp = a[i];
00412       a[i] = b[i];
00413       b[i] = tmp;
00414     }
00415   }
00416   tmp = (a[width] ^ b[width]) & mask_end;
00417   a[width] ^= tmp;
00418   b[width] ^= tmp;
00419 
00420   __M4RI_DD_ROW(M, rowa);
00421   __M4RI_DD_ROW(M, rowb);
00422 }
00423 
00432 static inline void mzd_row_swap(mzd_t *M, rci_t const rowa, rci_t const rowb) {
00433   if(rowa == rowb)
00434     return;
00435 
00436   wi_t width = M->width - 1;
00437   word *a = M->rows[rowa];
00438   word *b = M->rows[rowb];
00439   word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - M->offset);
00440   word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix);
00441 
00442   word tmp = (a[0] ^ b[0]) & mask_begin;
00443   if (width != 0) {
00444     a[0] ^= tmp;
00445     b[0] ^= tmp;
00446     
00447     for(wi_t i = 1; i < width; ++i) {
00448       tmp = a[i];
00449       a[i] = b[i];
00450       b[i] = tmp;
00451     }
00452     tmp = (a[width] ^ b[width]) & mask_end;
00453     a[width] ^= tmp;
00454     b[width] ^= tmp;
00455     
00456   } else {
00457     tmp &= mask_end;
00458     a[0] ^= tmp;
00459     b[0] ^= tmp;
00460   }
00461 
00462   __M4RI_DD_ROW(M, rowa);
00463   __M4RI_DD_ROW(M, rowb);
00464 }
00465 
00478 void mzd_copy_row(mzd_t *B, rci_t i, mzd_t const *A, rci_t j);
00479 
00488 void mzd_col_swap(mzd_t *M, rci_t const cola, rci_t const colb);
00489 
00500 static inline void mzd_col_swap_in_rows(mzd_t *M, rci_t const cola, rci_t const colb, rci_t const start_row, rci_t const stop_row) {
00501   if (cola == colb)
00502     return;
00503 
00504   rci_t const _cola = cola + M->offset;
00505   rci_t const _colb = colb + M->offset;
00506 
00507   wi_t const a_word = _cola / m4ri_radix;
00508   wi_t const b_word = _colb / m4ri_radix;
00509 
00510   int const a_bit = _cola % m4ri_radix;
00511   int const b_bit = _colb % m4ri_radix;
00512 
00513   word* RESTRICT ptr = mzd_row(M, start_row);
00514   int max_bit = MAX(a_bit, b_bit);
00515   int count_remaining = stop_row - start_row;
00516   int min_bit = a_bit + b_bit - max_bit;
00517   int block = mzd_row_to_block(M, start_row);
00518   int offset = max_bit - min_bit;
00519   word mask = m4ri_one << min_bit;
00520   int count = MIN(mzd_rows_in_block(M, 0), count_remaining);
00521   // Apparently we're calling with start_row == stop_row sometimes (seems a bug to me).
00522   if (count <= 0)
00523     return;
00524 
00525   if (a_word == b_word) {
00526     while(1) {
00527       count_remaining -= count;
00528       ptr += a_word;
00529       int fast_count = count / 4;
00530       int rest_count = count - 4 * fast_count;
00531       word xor_v[4];
00532       wi_t const rowstride = M->rowstride;
00533       while (fast_count--) {
00534         xor_v[0] = ptr[0];
00535         xor_v[1] = ptr[rowstride];
00536         xor_v[2] = ptr[2 * rowstride];
00537         xor_v[3] = ptr[3 * rowstride];
00538         xor_v[0] ^= xor_v[0] >> offset;
00539         xor_v[1] ^= xor_v[1] >> offset;
00540         xor_v[2] ^= xor_v[2] >> offset;
00541         xor_v[3] ^= xor_v[3] >> offset;
00542         xor_v[0] &= mask;
00543         xor_v[1] &= mask;
00544         xor_v[2] &= mask;
00545         xor_v[3] &= mask;
00546         xor_v[0] |= xor_v[0] << offset;
00547         xor_v[1] |= xor_v[1] << offset;
00548         xor_v[2] |= xor_v[2] << offset;
00549         xor_v[3] |= xor_v[3] << offset;
00550         ptr[0] ^= xor_v[0];
00551         ptr[rowstride] ^= xor_v[1];
00552         ptr[2 * rowstride] ^= xor_v[2];
00553         ptr[3 * rowstride] ^= xor_v[3];
00554         ptr += 4 * rowstride;
00555       }
00556       while (rest_count--) {
00557         word xor_v = *ptr;
00558         xor_v ^= xor_v >> offset;
00559         xor_v &= mask;
00560         *ptr ^= xor_v | (xor_v << offset);
00561         ptr += rowstride;
00562       }
00563       if ((count = MIN(mzd_rows_in_block(M, ++block), count_remaining)) <= 0)
00564         break;
00565       ptr = mzd_first_row_next_block(M, block);
00566     }
00567   } else {
00568     word* RESTRICT min_ptr;
00569     wi_t max_offset;
00570     if (min_bit == a_bit) {
00571       min_ptr = ptr + a_word;
00572       max_offset = b_word - a_word;
00573     } else {
00574       min_ptr = ptr + b_word;
00575       max_offset = a_word - b_word;
00576     }
00577     while(1) {
00578       count_remaining -= count;
00579       wi_t const rowstride = M->rowstride;
00580       while(count--) {
00581         word xor_v = (min_ptr[0] ^ (min_ptr[max_offset] >> offset)) & mask;
00582         min_ptr[0] ^= xor_v;
00583         min_ptr[max_offset] ^= xor_v << offset;
00584         min_ptr += rowstride;
00585       }
00586       if ((count = MIN(mzd_rows_in_block(M, ++block), count_remaining)) <= 0)
00587         break;
00588       ptr = mzd_first_row_next_block(M, block);
00589       if (min_bit == a_bit)
00590         min_ptr = ptr + a_word;
00591       else
00592         min_ptr = ptr + b_word;
00593     }
00594   }
00595 
00596   __M4RI_DD_MZD(M);
00597 }
00598 
00610 static inline BIT mzd_read_bit(mzd_t const *M, rci_t const row, rci_t const col ) {
00611   return __M4RI_GET_BIT(M->rows[row][(col+M->offset)/m4ri_radix], (col+M->offset) % m4ri_radix);
00612 }
00613 
00626 static inline void mzd_write_bit(mzd_t *M, rci_t const row, rci_t const col, BIT const value) {
00627   __M4RI_WRITE_BIT(M->rows[row][(col + M->offset) / m4ri_radix], (col + M->offset) % m4ri_radix, value);
00628 }
00629 
00630 
00641 static inline void mzd_xor_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n, word values) {
00642   int const spot = (y + M->offset) % m4ri_radix;
00643   wi_t const block = (y + M->offset) / m4ri_radix;
00644   M->rows[x][block] ^= values << spot;
00645   int const space = m4ri_radix - spot;
00646   if (n > space)
00647     M->rows[x][block + 1] ^= values >> space;
00648 }
00649 
00660 static inline void mzd_and_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n, word values) {
00661   /* This is the best way, since this will drop out once we inverse the bits in values: */
00662   values >>= (m4ri_radix - n);  /* Move the bits to the lowest columns */
00663 
00664   int const spot = (y + M->offset) % m4ri_radix;
00665   wi_t const block = (y + M->offset) / m4ri_radix;
00666   M->rows[x][block] &= values << spot;
00667   int const space = m4ri_radix - spot;
00668   if (n > space)
00669     M->rows[x][block + 1] &= values >> space;
00670 }
00671 
00681 static inline void mzd_clear_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n) {
00682   assert(n>0 && n <= m4ri_radix);
00683   word values = m4ri_ffff >> (m4ri_radix - n);
00684   int const spot = (y + M->offset) % m4ri_radix;
00685   wi_t const block = (y + M->offset) / m4ri_radix;
00686   M->rows[x][block] &= ~(values << spot);
00687   int const space = m4ri_radix - spot;
00688   if (n > space)
00689     M->rows[x][block + 1] &= ~(values >> space);
00690 }
00691 
00704 static inline void mzd_row_add_offset(mzd_t *M, rci_t dstrow, rci_t srcrow, rci_t coloffset) {
00705   assert(dstrow < M->nrows && srcrow < M->nrows && coloffset < M->ncols);
00706   coloffset += M->offset;
00707   wi_t const startblock= coloffset/m4ri_radix;
00708   wi_t wide = M->width - startblock;
00709   word *src = M->rows[srcrow] + startblock;
00710   word *dst = M->rows[dstrow] + startblock;
00711   word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - coloffset % m4ri_radix);
00712   word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix);
00713 
00714   *dst++ ^= *src++ & mask_begin;
00715   --wide;
00716 
00717 #if __M4RI_HAVE_SSE2 
00718   int not_aligned = __M4RI_ALIGNMENT(src,16) != 0;      /* 0: Aligned, 1: Not aligned */
00719   if (wide > not_aligned + 1)                           /* Speed up for small matrices */
00720   {
00721     if (not_aligned) {
00722       *dst++ ^= *src++;
00723       --wide;
00724     }
00725     /* Now wide > 1 */
00726     __m128i* __src = (__m128i*)src;
00727     __m128i* __dst = (__m128i*)dst;
00728     __m128i* const eof = (__m128i*)((unsigned long)(src + wide) & ~0xFUL);
00729     do
00730     {
00731       __m128i xmm1 = _mm_xor_si128(*__dst, *__src);
00732       *__dst++ = xmm1;
00733     }
00734     while(++__src < eof);
00735     src  = (word*)__src;
00736     dst = (word*)__dst;
00737     wide = ((sizeof(word)*wide)%16)/sizeof(word);
00738   }
00739 #endif
00740   wi_t i = -1;
00741   while(++i < wide)
00742     dst[i] ^= src[i];
00743   /* 
00744    * Revert possibly non-zero excess bits.
00745    * Note that i == wide here, and wide can be 0.
00746    * But really, src[wide - 1] is M->rows[srcrow][M->width - 1] ;)
00747    * We use i - 1 here to let the compiler know these are the same addresses
00748    * that we last accessed, in the previous loop.
00749    */
00750   dst[i - 1] ^= src[i - 1] & ~mask_end;
00751 
00752   __M4RI_DD_ROW(M, dstrow);
00753 }
00754 
00766 void mzd_row_add(mzd_t *M, rci_t const sourcerow, rci_t const destrow);
00767 
00782 mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A);
00783 
00798 mzd_t *mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B);
00799 
00814 mzd_t *mzd_addmul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B);
00815 
00827 mzd_t *_mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B, int const clear);
00828 
00838 mzd_t *_mzd_mul_va(mzd_t *C, mzd_t const *v, mzd_t const *A, int const clear);
00839 
00850 void mzd_randomize(mzd_t *M);
00851 
00866 void mzd_set_ui(mzd_t *M, unsigned int const value);
00867 
00883 rci_t mzd_gauss_delayed(mzd_t *M, rci_t const startcol, int const full);
00884 
00900 rci_t mzd_echelonize_naive(mzd_t *M, int const full);
00901 
00911 int mzd_equal(mzd_t const *A, mzd_t const *B);
00912 
00926 int mzd_cmp(mzd_t const *A, mzd_t const *B);
00927 
00935 mzd_t *mzd_copy(mzd_t *DST, mzd_t const *A);
00936 
00957 mzd_t *mzd_concat(mzd_t *C, mzd_t const *A, mzd_t const *B);
00958 
00978 mzd_t *mzd_stack(mzd_t *C, mzd_t const *A, mzd_t const *B);
00979 
00992 mzd_t *mzd_submatrix(mzd_t *S, mzd_t const *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc);
00993 
01007 mzd_t *mzd_invert_naive(mzd_t *INV, mzd_t const *A, mzd_t const *I);
01008 
01020 mzd_t *mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B);
01021 
01032 mzd_t *_mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B);
01033 
01044 #define mzd_sub mzd_add
01045 
01056 #define _mzd_sub _mzd_add
01057 
01058 
01059 
01069 static inline word mzd_read_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n) {
01070   int const spot = (y + M->offset) % m4ri_radix;
01071   wi_t const block = (y + M->offset) / m4ri_radix;
01072   int const spill = spot + n - m4ri_radix;
01073   word temp = (spill <= 0) ? M->rows[x][block] << -spill : (M->rows[x][block + 1] << (m4ri_radix - spill)) | (M->rows[x][block] >> spill);
01074   return temp >> (m4ri_radix - n);
01075 }
01076 
01077 
01097 void mzd_combine(mzd_t *DST, rci_t const row3, wi_t const startblock3,
01098                  mzd_t const *SC1, rci_t const row1, wi_t const startblock1, 
01099                  mzd_t const *SC2, rci_t const row2, wi_t const startblock2);
01100 
01101 
01121 static inline void mzd_combine_weird(mzd_t *C,       rci_t const c_row, wi_t const c_startblock,
01122                                      mzd_t const *A, rci_t const a_row, wi_t const a_startblock, 
01123                                      mzd_t const *B, rci_t const b_row, wi_t const b_startblock) {
01124   word tmp;
01125   rci_t i = 0;
01126 
01127 
01128   for(; i + m4ri_radix <= A->ncols; i += m4ri_radix) {
01129     tmp = mzd_read_bits(A, a_row, i, m4ri_radix) ^ mzd_read_bits(B, b_row, i, m4ri_radix);
01130     mzd_clear_bits(C, c_row, i, m4ri_radix);
01131     mzd_xor_bits(C, c_row, i, m4ri_radix, tmp);
01132   }
01133   if(A->ncols - i) {
01134     tmp = mzd_read_bits(A, a_row, i, (A->ncols - i)) ^ mzd_read_bits(B, b_row, i, (B->ncols - i));
01135     mzd_clear_bits(C, c_row, i, (C->ncols - i));
01136     mzd_xor_bits(C, c_row, i, (C->ncols - i), tmp);
01137   }
01138 
01139   __M4RI_DD_MZD(C);
01140 }
01141 
01158 static inline void mzd_combine_even_in_place(mzd_t *A,       rci_t const a_row, wi_t const a_startblock,
01159                                              mzd_t const *B, rci_t const b_row, wi_t const b_startblock) {
01160 
01161   wi_t wide = A->width - a_startblock - 1;
01162 
01163   word *a = A->rows[a_row] + a_startblock;
01164   word *b = B->rows[b_row] + b_startblock;
01165   
01166 #if __M4RI_HAVE_SSE2
01167   if(wide > __M4RI_SSE2_CUTOFF) {
01169     if (__M4RI_ALIGNMENT(a,16)) {
01170       *a++ ^= *b++;
01171       wide--;
01172     }
01173     
01174     if (__M4RI_ALIGNMENT(a, 16) == 0 && __M4RI_ALIGNMENT(b, 16) == 0) {
01175       __m128i *a128 = (__m128i*)a;
01176       __m128i *b128 = (__m128i*)b;
01177       const __m128i *eof = (__m128i*)((unsigned long)(a + wide) & ~0xFUL);
01178       
01179       do {
01180         *a128 = _mm_xor_si128(*a128, *b128);
01181         ++b128;
01182         ++a128;
01183       } while(a128 < eof);
01184       
01185       a = (word*)a128;
01186       b = (word*)b128;
01187       wide = ((sizeof(word) * wide) % 16) / sizeof(word);
01188     }
01189   }
01190 #endif // __M4RI_HAVE_SSE2
01191 
01192   if (wide > 0) {
01193     wi_t n = (wide + 7) / 8;
01194     switch (wide % 8) {
01195     case 0: do { *(a++) ^= *(b++);
01196     case 7:      *(a++) ^= *(b++);
01197     case 6:      *(a++) ^= *(b++);
01198     case 5:      *(a++) ^= *(b++);
01199     case 4:      *(a++) ^= *(b++);
01200     case 3:      *(a++) ^= *(b++);
01201     case 2:      *(a++) ^= *(b++);
01202     case 1:      *(a++) ^= *(b++);
01203     } while (--n > 0);
01204     }
01205   }
01206 
01207   *a ^= *b & __M4RI_LEFT_BITMASK(A->ncols%m4ri_radix);
01208 
01209   __M4RI_DD_MZD(A);
01210 }
01211 
01212 
01232 static inline void mzd_combine_even(mzd_t *C,       rci_t const c_row, wi_t const c_startblock,
01233                                     mzd_t const *A, rci_t const a_row, wi_t const a_startblock, 
01234                                     mzd_t const *B, rci_t const b_row, wi_t const b_startblock) {
01235 
01236   wi_t wide = A->width - a_startblock - 1;
01237   word *a = A->rows[a_row] + a_startblock;
01238   word *b = B->rows[b_row] + b_startblock;
01239   word *c = C->rows[c_row] + c_startblock;
01240   
01241   /* /\* this is a corner case triggered by Strassen multiplication */
01242   /*  * which assumes certain (virtual) matrix sizes  */
01243   /*  * 2011/03/07: I don't think this was ever correct *\/ */
01244   /* if (a_row >= A->nrows) { */
01245   /*   assert(a_row < A->nrows); */
01246   /*   for(wi_t i = 0; i < wide; ++i) { */
01247   /*     c[i] = b[i]; */
01248   /*   } */
01249   /* } else { */
01250 #if __M4RI_HAVE_SSE2
01251   if(wide > __M4RI_SSE2_CUTOFF) {
01253     if (__M4RI_ALIGNMENT(a,16)) {
01254       *c++ = *b++ ^ *a++;
01255       wide--;
01256     }
01257       
01258     if ( (__M4RI_ALIGNMENT(b, 16) | __M4RI_ALIGNMENT(c, 16)) == 0) {
01259       __m128i *a128 = (__m128i*)a;
01260       __m128i *b128 = (__m128i*)b;
01261       __m128i *c128 = (__m128i*)c;
01262       const __m128i *eof = (__m128i*)((unsigned long)(a + wide) & ~0xFUL);
01263       
01264       do {
01265         *c128 = _mm_xor_si128(*a128, *b128);
01266         ++c128;
01267         ++b128;
01268         ++a128;
01269       } while(a128 < eof);
01270       
01271       a = (word*)a128;
01272       b = (word*)b128;
01273       c = (word*)c128;
01274       wide = ((sizeof(word) * wide) % 16) / sizeof(word);
01275     }
01276   }
01277 #endif // __M4RI_HAVE_SSE2
01278 
01279   if (wide > 0) {
01280     wi_t n = (wide + 7) / 8;
01281     switch (wide % 8) {
01282     case 0: do { *(c++) = *(a++) ^ *(b++);
01283     case 7:      *(c++) = *(a++) ^ *(b++);
01284     case 6:      *(c++) = *(a++) ^ *(b++);
01285     case 5:      *(c++) = *(a++) ^ *(b++);
01286     case 4:      *(c++) = *(a++) ^ *(b++);
01287     case 3:      *(c++) = *(a++) ^ *(b++);
01288     case 2:      *(c++) = *(a++) ^ *(b++);
01289     case 1:      *(c++) = *(a++) ^ *(b++);
01290     } while (--n > 0);
01291     }
01292   }
01293   *c ^= ((*a ^ *b ^ *c) & __M4RI_LEFT_BITMASK(C->ncols%m4ri_radix));
01294 
01295   __M4RI_DD_MZD(C);
01296 }
01297 
01298 
01307 static inline int mzd_read_bits_int(mzd_t const *M, rci_t const x, rci_t const y, int const n)
01308 {
01309   return __M4RI_CONVERT_TO_INT(mzd_read_bits(M, x, y, n));
01310 }
01311 
01312 
01319 int mzd_is_zero(mzd_t const *A);
01320 
01329 void mzd_row_clear_offset(mzd_t *M, rci_t const row, rci_t const coloffset);
01330 
01347 int mzd_find_pivot(mzd_t const *M, rci_t start_row, rci_t start_col, rci_t *r, rci_t *c);
01348 
01349 
01362 double mzd_density(mzd_t const *A, wi_t res);
01363 
01378 double _mzd_density(mzd_t const *A, wi_t res, rci_t r, rci_t c);
01379 
01380 
01389 rci_t mzd_first_zero_row(mzd_t const *A);
01390 
01397 static inline unsigned long long mzd_hash(mzd_t const *A) {
01398   unsigned long long hash = 0;
01399   for (rci_t r = 0; r < A->nrows; ++r)
01400     hash ^= rotate_word(calculate_hash(A->rows[r], A->width), r % m4ri_radix);
01401   return hash;
01402 }
01403 
01413 mzd_t *mzd_extract_u(mzd_t *U, mzd_t const *A);
01414 
01424 mzd_t *mzd_extract_l(mzd_t *L, mzd_t const *A);
01425 
01426 #endif // M4RI_MZD