+2005-12-13 Richard Sandiford <richard@codesourcery.com>
+
+ * Make-lang.in (fortran/trans-resolve.o): Depend on
+ fortran/dependency.h.
+ * gfortran.h (gfc_expr): Add an "inline_noncopying_intrinsic" flag.
+ * dependency.h (gfc_get_noncopying_intrinsic_argument): Declare.
+ (gfc_check_fncall_dependency): Change prototype.
+ * dependency.c (gfc_get_noncopying_intrinsic_argument): New function.
+ (gfc_check_argument_var_dependency): New function, split from
+ gfc_check_fncall_dependency.
+ (gfc_check_argument_dependency): New function.
+ (gfc_check_fncall_dependency): Replace the expression parameter with
+ separate symbol and argument list parameters. Generalize the function
+ to handle dependencies for any type of expression, not just variables.
+ Accept a further argument giving the intent of the expression being
+ tested. Ignore intent(in) arguments if that expression is also
+ intent(in).
+ * resolve.c: Include dependency.h.
+ (find_noncopying_intrinsics): New function.
+ (resolve_function, resolve_call): Call it on success.
+ * trans-array.h (gfc_conv_array_transpose): Declare.
+ (gfc_check_fncall_dependency): Remove prototype.
+ * trans-array.c (gfc_conv_array_transpose): New function.
+ * trans-intrinsic.c (gfc_conv_intrinsic_function): Don't use the
+ libcall handling if the expression is to be evaluated inline.
+ Add a case for handling inline transpose()s.
+ * trans-expr.c (gfc_trans_arrayfunc_assign): Adjust for the new
+ interface provided by gfc_check_fncall_dependency.
+
2005-12-12 Steven G. Kargl <kargls@comcast.net>
PR fortran/25078
gt-fortran-trans-intrinsic.h
fortran/dependency.o: $(GFORTRAN_TRANS_DEPS) fortran/dependency.h
fortran/trans-common.o: $(GFORTRAN_TRANS_DEPS)
+fortran/resolve.o: fortran/dependency.h
}
+/* Some array-returning intrinsics can be implemented by reusing the
+ data from one of the array arguments. For example, TRANPOSE does
+ not necessarily need to allocate new data: it can be implemented
+ by copying the original array's descriptor and simply swapping the
+ two dimension specifications.
+
+ If EXPR is a call to such an intrinsic, return the argument
+ whose data can be reused, otherwise return NULL. */
+
+gfc_expr *
+gfc_get_noncopying_intrinsic_argument (gfc_expr * expr)
+{
+ if (expr->expr_type != EXPR_FUNCTION || !expr->value.function.isym)
+ return NULL;
+
+ switch (expr->value.function.isym->generic_id)
+ {
+ case GFC_ISYM_TRANSPOSE:
+ return expr->value.function.actual->expr;
+
+ default:
+ return NULL;
+ }
+}
+
+
/* Return true if the result of reference REF can only be constructed
using a temporary array. */
}
-/* Dependency checking for direct function return by reference.
- Returns true if the arguments of the function depend on the
- destination. This is considerably less conservative than other
- dependencies because many function arguments will already be
- copied into a temporary. */
+/* Return true if array variable VAR could be passed to the same function
+ as argument EXPR without interfering with EXPR. INTENT is the intent
+ of VAR.
+
+ This is considerably less conservative than other dependencies
+ because many function arguments will already be copied into a
+ temporary. */
+
+static int
+gfc_check_argument_var_dependency (gfc_expr * var, sym_intent intent,
+ gfc_expr * expr)
+{
+ gcc_assert (var->expr_type == EXPR_VARIABLE);
+ gcc_assert (var->rank > 0);
+
+ switch (expr->expr_type)
+ {
+ case EXPR_VARIABLE:
+ return (gfc_ref_needs_temporary_p (expr->ref)
+ || gfc_check_dependency (var, expr, NULL, 0));
+
+ case EXPR_ARRAY:
+ return gfc_check_dependency (var, expr, NULL, 0);
+
+ case EXPR_FUNCTION:
+ if (intent != INTENT_IN && expr->inline_noncopying_intrinsic)
+ {
+ expr = gfc_get_noncopying_intrinsic_argument (expr);
+ return gfc_check_argument_var_dependency (var, intent, expr);
+ }
+ return 0;
+
+ default:
+ return 0;
+ }
+}
+
+
+/* Like gfc_check_argument_var_dependency, but extended to any
+ array expression OTHER, not just variables. */
+
+static int
+gfc_check_argument_dependency (gfc_expr * other, sym_intent intent,
+ gfc_expr * expr)
+{
+ switch (other->expr_type)
+ {
+ case EXPR_VARIABLE:
+ return gfc_check_argument_var_dependency (other, intent, expr);
+
+ case EXPR_FUNCTION:
+ if (other->inline_noncopying_intrinsic)
+ {
+ other = gfc_get_noncopying_intrinsic_argument (other);
+ return gfc_check_argument_dependency (other, INTENT_IN, expr);
+ }
+ return 0;
+
+ default:
+ return 0;
+ }
+}
+
+
+/* Like gfc_check_argument_dependency, but check all the arguments in ACTUAL.
+ FNSYM is the function being called, or NULL if not known. */
int
-gfc_check_fncall_dependency (gfc_expr * dest, gfc_expr * fncall)
+gfc_check_fncall_dependency (gfc_expr * other, sym_intent intent,
+ gfc_symbol * fnsym, gfc_actual_arglist * actual)
{
- gfc_actual_arglist *actual;
+ gfc_formal_arglist *formal;
gfc_expr *expr;
- gcc_assert (dest->expr_type == EXPR_VARIABLE
- && fncall->expr_type == EXPR_FUNCTION);
- gcc_assert (fncall->rank > 0);
-
- for (actual = fncall->value.function.actual; actual; actual = actual->next)
+ formal = fnsym ? fnsym->formal : NULL;
+ for (; actual; actual = actual->next, formal = formal ? formal->next : NULL)
{
expr = actual->expr;
if (!expr)
continue;
- /* Non-variable expressions will be allocated temporaries anyway. */
- switch (expr->expr_type)
- {
- case EXPR_VARIABLE:
- if (!gfc_ref_needs_temporary_p (expr->ref)
- && gfc_check_dependency (dest, expr, NULL, 0))
- return 1;
- break;
-
- case EXPR_ARRAY:
- if (gfc_check_dependency (dest, expr, NULL, 0))
- return 1;
- break;
+ /* Skip intent(in) arguments if OTHER itself is intent(in). */
+ if (formal
+ && intent == INTENT_IN
+ && formal->sym->attr.intent == INTENT_IN)
+ continue;
- default:
- break;
- }
+ if (gfc_check_argument_dependency (other, intent, expr))
+ return 1;
}
return 0;
bool gfc_ref_needs_temporary_p (gfc_ref *);
-int gfc_check_fncall_dependency (gfc_expr *, gfc_expr *);
+gfc_expr *gfc_get_noncopying_intrinsic_argument (gfc_expr *);
+int gfc_check_fncall_dependency (gfc_expr *, sym_intent, gfc_symbol *,
+ gfc_actual_arglist *);
int gfc_check_dependency (gfc_expr *, gfc_expr *, gfc_expr **, int);
int gfc_is_same_range (gfc_array_ref *, gfc_array_ref *, int, int);
int gfc_dep_compare_expr (gfc_expr *, gfc_expr *);
/* True if it is converted from Hollerith constant. */
unsigned int from_H : 1;
+ /* True if the expression is a call to a function that returns an array,
+ and if we have decided not to allocate temporary data for that array. */
+ unsigned int inline_noncopying_intrinsic : 1;
union
{
#include "system.h"
#include "gfortran.h"
#include "arith.h" /* For gfc_compare_expr(). */
+#include "dependency.h"
/* Types used in equivalence statements. */
}
+/* Go through each actual argument in ACTUAL and see if it can be
+ implemented as an inlined, non-copying intrinsic. FNSYM is the
+ function being called, or NULL if not known. */
+
+static void
+find_noncopying_intrinsics (gfc_symbol * fnsym, gfc_actual_arglist * actual)
+{
+ gfc_actual_arglist *ap;
+ gfc_expr *expr;
+
+ for (ap = actual; ap; ap = ap->next)
+ if (ap->expr
+ && (expr = gfc_get_noncopying_intrinsic_argument (ap->expr))
+ && !gfc_check_fncall_dependency (expr, INTENT_IN, fnsym, actual))
+ ap->expr->inline_noncopying_intrinsic = 1;
+}
+
+
/************* Function resolution *************/
/* Resolve a function call known to be generic.
}
}
+ if (t == SUCCESS)
+ find_noncopying_intrinsics (expr->value.function.esym,
+ expr->value.function.actual);
return t;
}
if (resolve_actual_arglist (c->ext.actual) == FAILURE)
return FAILURE;
- if (c->resolved_sym != NULL)
- return SUCCESS;
-
- switch (procedure_kind (c->symtree->n.sym))
- {
- case PTYPE_GENERIC:
- t = resolve_generic_s (c);
- break;
+ t = SUCCESS;
+ if (c->resolved_sym == NULL)
+ switch (procedure_kind (c->symtree->n.sym))
+ {
+ case PTYPE_GENERIC:
+ t = resolve_generic_s (c);
+ break;
- case PTYPE_SPECIFIC:
- t = resolve_specific_s (c);
- break;
+ case PTYPE_SPECIFIC:
+ t = resolve_specific_s (c);
+ break;
- case PTYPE_UNKNOWN:
- t = resolve_unknown_s (c);
- break;
+ case PTYPE_UNKNOWN:
+ t = resolve_unknown_s (c);
+ break;
- default:
- gfc_internal_error ("resolve_subroutine(): bad function type");
- }
+ default:
+ gfc_internal_error ("resolve_subroutine(): bad function type");
+ }
+ if (t == SUCCESS)
+ find_noncopying_intrinsics (c->resolved_sym, c->ext.actual);
return t;
}
}
+/* Generate code to tranpose array EXPR by creating a new descriptor
+ in which the dimension specifications have been reversed. */
+
+void
+gfc_conv_array_transpose (gfc_se * se, gfc_expr * expr)
+{
+ tree dest, src, dest_index, src_index;
+ gfc_loopinfo *loop;
+ gfc_ss_info *dest_info, *src_info;
+ gfc_ss *dest_ss, *src_ss;
+ gfc_se src_se;
+ int n;
+
+ loop = se->loop;
+
+ src_ss = gfc_walk_expr (expr);
+ dest_ss = se->ss;
+
+ src_info = &src_ss->data.info;
+ dest_info = &dest_ss->data.info;
+
+ /* Get a descriptor for EXPR. */
+ gfc_init_se (&src_se, NULL);
+ gfc_conv_expr_descriptor (&src_se, expr, src_ss);
+ gfc_add_block_to_block (&se->pre, &src_se.pre);
+ gfc_add_block_to_block (&se->post, &src_se.post);
+ src = src_se.expr;
+
+ /* Allocate a new descriptor for the return value. */
+ dest = gfc_create_var (TREE_TYPE (src), "atmp");
+ dest_info->descriptor = dest;
+ se->expr = dest;
+
+ /* Copy across the dtype field. */
+ gfc_add_modify_expr (&se->pre,
+ gfc_conv_descriptor_dtype (dest),
+ gfc_conv_descriptor_dtype (src));
+
+ /* Copy the dimension information, renumbering dimension 1 to 0 and
+ 0 to 1. */
+ gcc_assert (dest_info->dimen == 2);
+ gcc_assert (src_info->dimen == 2);
+ for (n = 0; n < 2; n++)
+ {
+ dest_info->delta[n] = integer_zero_node;
+ dest_info->start[n] = integer_zero_node;
+ dest_info->stride[n] = integer_one_node;
+ dest_info->dim[n] = n;
+
+ dest_index = gfc_rank_cst[n];
+ src_index = gfc_rank_cst[1 - n];
+
+ gfc_add_modify_expr (&se->pre,
+ gfc_conv_descriptor_stride (dest, dest_index),
+ gfc_conv_descriptor_stride (src, src_index));
+
+ gfc_add_modify_expr (&se->pre,
+ gfc_conv_descriptor_lbound (dest, dest_index),
+ gfc_conv_descriptor_lbound (src, src_index));
+
+ gfc_add_modify_expr (&se->pre,
+ gfc_conv_descriptor_ubound (dest, dest_index),
+ gfc_conv_descriptor_ubound (src, src_index));
+
+ if (!loop->to[n])
+ {
+ gcc_assert (integer_zerop (loop->from[n]));
+ loop->to[n] = build2 (MINUS_EXPR, gfc_array_index_type,
+ gfc_conv_descriptor_ubound (dest, dest_index),
+ gfc_conv_descriptor_lbound (dest, dest_index));
+ }
+ }
+
+ /* Copy the data pointer. */
+ dest_info->data = gfc_conv_descriptor_data_get (src);
+ gfc_conv_descriptor_data_set (&se->pre, dest, dest_info->data);
+
+ /* Copy the offset. This is not changed by transposition: the top-left
+ element is still at the same offset as before. */
+ dest_info->offset = gfc_conv_descriptor_offset (src);
+ gfc_add_modify_expr (&se->pre,
+ gfc_conv_descriptor_offset (dest),
+ dest_info->offset);
+
+ if (dest_info->dimen > loop->temp_dim)
+ loop->temp_dim = dest_info->dimen;
+}
+
+
/* Return the number of iterations in a loop that starts at START,
ends at END, and has step STEP. */
void gfc_conv_expr_descriptor (gfc_se *, gfc_expr *, gfc_ss *);
/* Convert an array for passing as an actual function parameter. */
void gfc_conv_array_parameter (gfc_se *, gfc_expr *, gfc_ss *, int);
+/* Evaluate and transpose a matrix expression. */
+void gfc_conv_array_transpose (gfc_se *, gfc_expr *);
/* These work with both descriptors and descriptorless arrays. */
tree gfc_conv_array_data (tree);
/* Dependency checking for WHERE and FORALL. */
int gfc_check_dependency (gfc_expr *, gfc_expr *, gfc_expr **, int);
-/* Dependency checking for function calls. */
-int gfc_check_fncall_dependency (gfc_expr *, gfc_expr *);
/* Add pre-loop scalarization code for intrinsic functions which require
special handling. */
}
/* Check for a dependency. */
- if (gfc_check_fncall_dependency (expr1, expr2))
+ if (gfc_check_fncall_dependency (expr1, INTENT_OUT,
+ expr2->value.function.esym,
+ expr2->value.function.actual))
return NULL;
/* The frontend doesn't seem to bother filling in expr->symtree for intrinsic
name = &expr->value.function.name[2];
- if (expr->rank > 0)
+ if (expr->rank > 0 && !expr->inline_noncopying_intrinsic)
{
lib = gfc_is_intrinsic_libcall (expr);
if (lib != 0)
gfc_conv_intrinsic_bound (se, expr, 0);
break;
+ case GFC_ISYM_TRANSPOSE:
+ if (se->ss && se->ss->useflags)
+ {
+ gfc_conv_tmp_array_ref (se);
+ gfc_advance_se_ss_chain (se);
+ }
+ else
+ gfc_conv_array_transpose (se, expr->value.function.actual->expr);
+ break;
+
case GFC_ISYM_LEN:
gfc_conv_intrinsic_len (se, expr);
break;
+2005-12-13 Richard Sandiford <richard@codesourcery.com>
+ Victor Leikehman <LEI@il.ibm.com>
+
+ * m4/matmul.m4: Use a different order in the special case of a
+ transposed first argument.
+ * generated/matmul_c4.c, generated/matmul_c8.c, generated/matmul_c10.c,
+ * generated/matmul_c16.c, generated/matmul_i4.c, generated/matmul_i8.c,
+ * generated/matmul_i10.c, generated/matmul_r4.c, generated/matmul_r8.c
+ * generated/matmul_r10.c, generated/matmul_r16.c: Regenerated.
+
2005-12-10 Janne Blomqvist <jb@gcc.gnu.org>
* Makefile.am: Enable loop unrolling for matmul.
#if defined (HAVE_GFC_COMPLEX_10)
-/* This is a C version of the following fortran pseudo-code. The key
- point is the loop order -- we access all arrays column-first, which
- improves the performance enough to boost galgel spec score by 50%.
+/* The order of loops is different in the case of plain matrix
+ multiplication C=MATMUL(A,B), and in the frequent special case where
+ the argument A is the temporary result of a TRANSPOSE intrinsic:
+ C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
+ looking at their strides.
+
+ The equivalent Fortran pseudo-code is:
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
- C = 0
- DO J=1,N
- DO K=1,COUNT
+ IF (.NOT.IS_TRANSPOSED(A)) THEN
+ C = 0
+ DO J=1,N
+ DO K=1,COUNT
+ DO I=1,M
+ C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ ELSE
+ DO J=1,N
DO I=1,M
- C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ S = 0
+ DO K=1,COUNT
+ S = S+A(I,K)+B(K,J)
+ C(I,J) = S
+ ENDIF
*/
extern void matmul_c10 (gfc_array_c10 * const restrict retarray,
}
}
}
- else
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ const GFC_COMPLEX_10 *restrict abase_x;
+ const GFC_COMPLEX_10 *restrict bbase_y;
+ GFC_COMPLEX_10 *restrict dest_y;
+ GFC_COMPLEX_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
/* dest[x,y] += a[x,n] * b[n,y] */
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
}
+ else
+ {
+ const GFC_COMPLEX_10 *restrict abase_x;
+ const GFC_COMPLEX_10 *restrict bbase_y;
+ GFC_COMPLEX_10 *restrict dest_y;
+ GFC_COMPLEX_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
}
#endif
#if defined (HAVE_GFC_COMPLEX_16)
-/* This is a C version of the following fortran pseudo-code. The key
- point is the loop order -- we access all arrays column-first, which
- improves the performance enough to boost galgel spec score by 50%.
+/* The order of loops is different in the case of plain matrix
+ multiplication C=MATMUL(A,B), and in the frequent special case where
+ the argument A is the temporary result of a TRANSPOSE intrinsic:
+ C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
+ looking at their strides.
+
+ The equivalent Fortran pseudo-code is:
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
- C = 0
- DO J=1,N
- DO K=1,COUNT
+ IF (.NOT.IS_TRANSPOSED(A)) THEN
+ C = 0
+ DO J=1,N
+ DO K=1,COUNT
+ DO I=1,M
+ C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ ELSE
+ DO J=1,N
DO I=1,M
- C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ S = 0
+ DO K=1,COUNT
+ S = S+A(I,K)+B(K,J)
+ C(I,J) = S
+ ENDIF
*/
extern void matmul_c16 (gfc_array_c16 * const restrict retarray,
}
}
}
- else
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ const GFC_COMPLEX_16 *restrict abase_x;
+ const GFC_COMPLEX_16 *restrict bbase_y;
+ GFC_COMPLEX_16 *restrict dest_y;
+ GFC_COMPLEX_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
/* dest[x,y] += a[x,n] * b[n,y] */
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
}
+ else
+ {
+ const GFC_COMPLEX_16 *restrict abase_x;
+ const GFC_COMPLEX_16 *restrict bbase_y;
+ GFC_COMPLEX_16 *restrict dest_y;
+ GFC_COMPLEX_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
}
#endif
#if defined (HAVE_GFC_COMPLEX_4)
-/* This is a C version of the following fortran pseudo-code. The key
- point is the loop order -- we access all arrays column-first, which
- improves the performance enough to boost galgel spec score by 50%.
+/* The order of loops is different in the case of plain matrix
+ multiplication C=MATMUL(A,B), and in the frequent special case where
+ the argument A is the temporary result of a TRANSPOSE intrinsic:
+ C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
+ looking at their strides.
+
+ The equivalent Fortran pseudo-code is:
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
- C = 0
- DO J=1,N
- DO K=1,COUNT
+ IF (.NOT.IS_TRANSPOSED(A)) THEN
+ C = 0
+ DO J=1,N
+ DO K=1,COUNT
+ DO I=1,M
+ C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ ELSE
+ DO J=1,N
DO I=1,M
- C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ S = 0
+ DO K=1,COUNT
+ S = S+A(I,K)+B(K,J)
+ C(I,J) = S
+ ENDIF
*/
extern void matmul_c4 (gfc_array_c4 * const restrict retarray,
}
}
}
- else
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ const GFC_COMPLEX_4 *restrict abase_x;
+ const GFC_COMPLEX_4 *restrict bbase_y;
+ GFC_COMPLEX_4 *restrict dest_y;
+ GFC_COMPLEX_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
/* dest[x,y] += a[x,n] * b[n,y] */
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
}
+ else
+ {
+ const GFC_COMPLEX_4 *restrict abase_x;
+ const GFC_COMPLEX_4 *restrict bbase_y;
+ GFC_COMPLEX_4 *restrict dest_y;
+ GFC_COMPLEX_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
}
#endif
#if defined (HAVE_GFC_COMPLEX_8)
-/* This is a C version of the following fortran pseudo-code. The key
- point is the loop order -- we access all arrays column-first, which
- improves the performance enough to boost galgel spec score by 50%.
+/* The order of loops is different in the case of plain matrix
+ multiplication C=MATMUL(A,B), and in the frequent special case where
+ the argument A is the temporary result of a TRANSPOSE intrinsic:
+ C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
+ looking at their strides.
+
+ The equivalent Fortran pseudo-code is:
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
- C = 0
- DO J=1,N
- DO K=1,COUNT
+ IF (.NOT.IS_TRANSPOSED(A)) THEN
+ C = 0
+ DO J=1,N
+ DO K=1,COUNT
+ DO I=1,M
+ C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ ELSE
+ DO J=1,N
DO I=1,M
- C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ S = 0
+ DO K=1,COUNT
+ S = S+A(I,K)+B(K,J)
+ C(I,J) = S
+ ENDIF
*/
extern void matmul_c8 (gfc_array_c8 * const restrict retarray,
}
}
}
- else
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ const GFC_COMPLEX_8 *restrict abase_x;
+ const GFC_COMPLEX_8 *restrict bbase_y;
+ GFC_COMPLEX_8 *restrict dest_y;
+ GFC_COMPLEX_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
/* dest[x,y] += a[x,n] * b[n,y] */
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
}
+ else
+ {
+ const GFC_COMPLEX_8 *restrict abase_x;
+ const GFC_COMPLEX_8 *restrict bbase_y;
+ GFC_COMPLEX_8 *restrict dest_y;
+ GFC_COMPLEX_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
}
#endif
#if defined (HAVE_GFC_INTEGER_16)
-/* This is a C version of the following fortran pseudo-code. The key
- point is the loop order -- we access all arrays column-first, which
- improves the performance enough to boost galgel spec score by 50%.
+/* The order of loops is different in the case of plain matrix
+ multiplication C=MATMUL(A,B), and in the frequent special case where
+ the argument A is the temporary result of a TRANSPOSE intrinsic:
+ C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
+ looking at their strides.
+
+ The equivalent Fortran pseudo-code is:
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
- C = 0
- DO J=1,N
- DO K=1,COUNT
+ IF (.NOT.IS_TRANSPOSED(A)) THEN
+ C = 0
+ DO J=1,N
+ DO K=1,COUNT
+ DO I=1,M
+ C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ ELSE
+ DO J=1,N
DO I=1,M
- C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ S = 0
+ DO K=1,COUNT
+ S = S+A(I,K)+B(K,J)
+ C(I,J) = S
+ ENDIF
*/
extern void matmul_i16 (gfc_array_i16 * const restrict retarray,
}
}
}
- else
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ const GFC_INTEGER_16 *restrict abase_x;
+ const GFC_INTEGER_16 *restrict bbase_y;
+ GFC_INTEGER_16 *restrict dest_y;
+ GFC_INTEGER_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
/* dest[x,y] += a[x,n] * b[n,y] */
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
}
+ else
+ {
+ const GFC_INTEGER_16 *restrict abase_x;
+ const GFC_INTEGER_16 *restrict bbase_y;
+ GFC_INTEGER_16 *restrict dest_y;
+ GFC_INTEGER_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
}
#endif
#if defined (HAVE_GFC_INTEGER_4)
-/* This is a C version of the following fortran pseudo-code. The key
- point is the loop order -- we access all arrays column-first, which
- improves the performance enough to boost galgel spec score by 50%.
+/* The order of loops is different in the case of plain matrix
+ multiplication C=MATMUL(A,B), and in the frequent special case where
+ the argument A is the temporary result of a TRANSPOSE intrinsic:
+ C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
+ looking at their strides.
+
+ The equivalent Fortran pseudo-code is:
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
- C = 0
- DO J=1,N
- DO K=1,COUNT
+ IF (.NOT.IS_TRANSPOSED(A)) THEN
+ C = 0
+ DO J=1,N
+ DO K=1,COUNT
+ DO I=1,M
+ C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ ELSE
+ DO J=1,N
DO I=1,M
- C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ S = 0
+ DO K=1,COUNT
+ S = S+A(I,K)+B(K,J)
+ C(I,J) = S
+ ENDIF
*/
extern void matmul_i4 (gfc_array_i4 * const restrict retarray,
}
}
}
- else
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ const GFC_INTEGER_4 *restrict abase_x;
+ const GFC_INTEGER_4 *restrict bbase_y;
+ GFC_INTEGER_4 *restrict dest_y;
+ GFC_INTEGER_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
/* dest[x,y] += a[x,n] * b[n,y] */
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
}
+ else
+ {
+ const GFC_INTEGER_4 *restrict abase_x;
+ const GFC_INTEGER_4 *restrict bbase_y;
+ GFC_INTEGER_4 *restrict dest_y;
+ GFC_INTEGER_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
}
#endif
#if defined (HAVE_GFC_INTEGER_8)
-/* This is a C version of the following fortran pseudo-code. The key
- point is the loop order -- we access all arrays column-first, which
- improves the performance enough to boost galgel spec score by 50%.
+/* The order of loops is different in the case of plain matrix
+ multiplication C=MATMUL(A,B), and in the frequent special case where
+ the argument A is the temporary result of a TRANSPOSE intrinsic:
+ C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
+ looking at their strides.
+
+ The equivalent Fortran pseudo-code is:
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
- C = 0
- DO J=1,N
- DO K=1,COUNT
+ IF (.NOT.IS_TRANSPOSED(A)) THEN
+ C = 0
+ DO J=1,N
+ DO K=1,COUNT
+ DO I=1,M
+ C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ ELSE
+ DO J=1,N
DO I=1,M
- C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ S = 0
+ DO K=1,COUNT
+ S = S+A(I,K)+B(K,J)
+ C(I,J) = S
+ ENDIF
*/
extern void matmul_i8 (gfc_array_i8 * const restrict retarray,
}
}
}
- else
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ const GFC_INTEGER_8 *restrict abase_x;
+ const GFC_INTEGER_8 *restrict bbase_y;
+ GFC_INTEGER_8 *restrict dest_y;
+ GFC_INTEGER_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
/* dest[x,y] += a[x,n] * b[n,y] */
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
}
+ else
+ {
+ const GFC_INTEGER_8 *restrict abase_x;
+ const GFC_INTEGER_8 *restrict bbase_y;
+ GFC_INTEGER_8 *restrict dest_y;
+ GFC_INTEGER_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_INTEGER_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
}
#endif
#if defined (HAVE_GFC_REAL_10)
-/* This is a C version of the following fortran pseudo-code. The key
- point is the loop order -- we access all arrays column-first, which
- improves the performance enough to boost galgel spec score by 50%.
+/* The order of loops is different in the case of plain matrix
+ multiplication C=MATMUL(A,B), and in the frequent special case where
+ the argument A is the temporary result of a TRANSPOSE intrinsic:
+ C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
+ looking at their strides.
+
+ The equivalent Fortran pseudo-code is:
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
- C = 0
- DO J=1,N
- DO K=1,COUNT
+ IF (.NOT.IS_TRANSPOSED(A)) THEN
+ C = 0
+ DO J=1,N
+ DO K=1,COUNT
+ DO I=1,M
+ C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ ELSE
+ DO J=1,N
DO I=1,M
- C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ S = 0
+ DO K=1,COUNT
+ S = S+A(I,K)+B(K,J)
+ C(I,J) = S
+ ENDIF
*/
extern void matmul_r10 (gfc_array_r10 * const restrict retarray,
}
}
}
- else
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ const GFC_REAL_10 *restrict abase_x;
+ const GFC_REAL_10 *restrict bbase_y;
+ GFC_REAL_10 *restrict dest_y;
+ GFC_REAL_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
/* dest[x,y] += a[x,n] * b[n,y] */
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
}
+ else
+ {
+ const GFC_REAL_10 *restrict abase_x;
+ const GFC_REAL_10 *restrict bbase_y;
+ GFC_REAL_10 *restrict dest_y;
+ GFC_REAL_10 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_10) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
}
#endif
#if defined (HAVE_GFC_REAL_16)
-/* This is a C version of the following fortran pseudo-code. The key
- point is the loop order -- we access all arrays column-first, which
- improves the performance enough to boost galgel spec score by 50%.
+/* The order of loops is different in the case of plain matrix
+ multiplication C=MATMUL(A,B), and in the frequent special case where
+ the argument A is the temporary result of a TRANSPOSE intrinsic:
+ C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
+ looking at their strides.
+
+ The equivalent Fortran pseudo-code is:
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
- C = 0
- DO J=1,N
- DO K=1,COUNT
+ IF (.NOT.IS_TRANSPOSED(A)) THEN
+ C = 0
+ DO J=1,N
+ DO K=1,COUNT
+ DO I=1,M
+ C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ ELSE
+ DO J=1,N
DO I=1,M
- C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ S = 0
+ DO K=1,COUNT
+ S = S+A(I,K)+B(K,J)
+ C(I,J) = S
+ ENDIF
*/
extern void matmul_r16 (gfc_array_r16 * const restrict retarray,
}
}
}
- else
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ const GFC_REAL_16 *restrict abase_x;
+ const GFC_REAL_16 *restrict bbase_y;
+ GFC_REAL_16 *restrict dest_y;
+ GFC_REAL_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
/* dest[x,y] += a[x,n] * b[n,y] */
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
}
+ else
+ {
+ const GFC_REAL_16 *restrict abase_x;
+ const GFC_REAL_16 *restrict bbase_y;
+ GFC_REAL_16 *restrict dest_y;
+ GFC_REAL_16 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_16) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
}
#endif
#if defined (HAVE_GFC_REAL_4)
-/* This is a C version of the following fortran pseudo-code. The key
- point is the loop order -- we access all arrays column-first, which
- improves the performance enough to boost galgel spec score by 50%.
+/* The order of loops is different in the case of plain matrix
+ multiplication C=MATMUL(A,B), and in the frequent special case where
+ the argument A is the temporary result of a TRANSPOSE intrinsic:
+ C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
+ looking at their strides.
+
+ The equivalent Fortran pseudo-code is:
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
- C = 0
- DO J=1,N
- DO K=1,COUNT
+ IF (.NOT.IS_TRANSPOSED(A)) THEN
+ C = 0
+ DO J=1,N
+ DO K=1,COUNT
+ DO I=1,M
+ C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ ELSE
+ DO J=1,N
DO I=1,M
- C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ S = 0
+ DO K=1,COUNT
+ S = S+A(I,K)+B(K,J)
+ C(I,J) = S
+ ENDIF
*/
extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
}
}
}
- else
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ const GFC_REAL_4 *restrict abase_x;
+ const GFC_REAL_4 *restrict bbase_y;
+ GFC_REAL_4 *restrict dest_y;
+ GFC_REAL_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
/* dest[x,y] += a[x,n] * b[n,y] */
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
}
+ else
+ {
+ const GFC_REAL_4 *restrict abase_x;
+ const GFC_REAL_4 *restrict bbase_y;
+ GFC_REAL_4 *restrict dest_y;
+ GFC_REAL_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
}
#endif
#if defined (HAVE_GFC_REAL_8)
-/* This is a C version of the following fortran pseudo-code. The key
- point is the loop order -- we access all arrays column-first, which
- improves the performance enough to boost galgel spec score by 50%.
+/* The order of loops is different in the case of plain matrix
+ multiplication C=MATMUL(A,B), and in the frequent special case where
+ the argument A is the temporary result of a TRANSPOSE intrinsic:
+ C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
+ looking at their strides.
+
+ The equivalent Fortran pseudo-code is:
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
- C = 0
- DO J=1,N
- DO K=1,COUNT
+ IF (.NOT.IS_TRANSPOSED(A)) THEN
+ C = 0
+ DO J=1,N
+ DO K=1,COUNT
+ DO I=1,M
+ C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ ELSE
+ DO J=1,N
DO I=1,M
- C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ S = 0
+ DO K=1,COUNT
+ S = S+A(I,K)+B(K,J)
+ C(I,J) = S
+ ENDIF
*/
extern void matmul_r8 (gfc_array_r8 * const restrict retarray,
}
}
}
- else
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ const GFC_REAL_8 *restrict abase_x;
+ const GFC_REAL_8 *restrict bbase_y;
+ GFC_REAL_8 *restrict dest_y;
+ GFC_REAL_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
/* dest[x,y] += a[x,n] * b[n,y] */
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
}
+ else
+ {
+ const GFC_REAL_8 *restrict abase_x;
+ const GFC_REAL_8 *restrict bbase_y;
+ GFC_REAL_8 *restrict dest_y;
+ GFC_REAL_8 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_REAL_8) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
}
#endif
`#if defined (HAVE_'rtype_name`)'
-/* This is a C version of the following fortran pseudo-code. The key
- point is the loop order -- we access all arrays column-first, which
- improves the performance enough to boost galgel spec score by 50%.
+/* The order of loops is different in the case of plain matrix
+ multiplication C=MATMUL(A,B), and in the frequent special case where
+ the argument A is the temporary result of a TRANSPOSE intrinsic:
+ C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
+ looking at their strides.
+
+ The equivalent Fortran pseudo-code is:
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
- C = 0
- DO J=1,N
- DO K=1,COUNT
+ IF (.NOT.IS_TRANSPOSED(A)) THEN
+ C = 0
+ DO J=1,N
+ DO K=1,COUNT
+ DO I=1,M
+ C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ ELSE
+ DO J=1,N
DO I=1,M
- C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ S = 0
+ DO K=1,COUNT
+ S = S+A(I,K)+B(K,J)
+ C(I,J) = S
+ ENDIF
*/
extern void matmul_`'rtype_code (rtype * const restrict retarray,
}
}
}
- else
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ const rtype_name *restrict abase_x;
+ const rtype_name *restrict bbase_y;
+ rtype_name *restrict dest_y;
+ rtype_name s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (rtype_name) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
/* dest[x,y] += a[x,n] * b[n,y] */
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
}
+ else
+ {
+ const rtype_name *restrict abase_x;
+ const rtype_name *restrict bbase_y;
+ rtype_name *restrict dest_y;
+ rtype_name s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (rtype_name) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
}
#endif