/* The Berkeley UPC Runtime Specification
 * Version: 3.11
 * Copyright 2002-4, Dan Bonachea <bonachea@cs.berkeley.edu>
 * Portions Copyright 2002-14, E. O. Lawrence Berekely National Laboratory
 */

/* This file describes the interface between the platform-independent code
   generated by a UPC-to-C translating compiler, and the hand-written UPC
   runtime layer that implements the language on a given architecture

  Many/most of the operations below will be implemented using macros or inline functions in an actual 
    implementation (a number of design decisions in the interface were based on the expected optimizations 
     that will occur in such an implementation)
  They are specified using function declaration syntax below to make the types clear
  All correct generated code must type check using the definitions below
  In no case should client code assume it can create a "function pointer" to any of these operations

  Note this interface is meant primarily as a compilation target for a code generator, 
   not a library for hand-written code - as such, the goals of expressiveness and performance
   generally take precedence over readability and minimality
   
  Implementation-specific values in declarations are indicated using "???"
  Sections marked "Implementor's note" are recommendations to implementors and are 
    not part of the specification
 */

#include <inttypes.h>


/* ------------------------------------------------------------------------------------ */
/* UPC Runtime Types
   =================
  
   For more information on a particular type, see the relevant section of the
   specification below.
  
    upcr_thread_t		// UPC thread number
    upcr_shared_ptr_t		// pointer-to-shared 
    upcr_pshared_ptr_t		// phaseless pointer-to-shared (indefinitely blocked or blocksize=1)
    upcr_phase_t		// phase of pointer-to-shared
    upcr_register_value_t	// largest unsigned integer type that will fit in a CPU register
    upcr_handle_t		// handle for nonblocking operations
    upcr_valget_handle_t	// handle for nonblocking value get operations
    upcr_startup_shalloc_t	// Information struct for statically allocated shared data
    upcr_startup_pshalloc_t	// Information struct for statically allocated, phaseless shared data
    upcr_startup_arrayinit_diminfo_t // Initialization info for each dimension of a statically 
				     // allocated shared array
 */


/* ------------------------------------------------------------------------------------ */
/* Control Interface
   =================
 */

/*****************************************************************************
 * Runtime initialization functions.
 *
 * There are two sets of initialization functions for the Berkeley UPC
 * runtime:  one low-level set targeted at compiler developers, who want the
 * largest amount of control over behavior, and one 'simpler' interface,
 * targeted at application/library developers who wish to use UPC within a
 * larger, non-UPC C/C++ program (though it can also be used by UPC
 * compilers).  The simpler interface (bupc_init() andi bupc_init_reentrant())
 * uses the lower-level API, plus a set of 'magic' global variables provided
 * by the UPC linker, to provide the full set of information needed, while the
 * low-level API takes all needed information in the function parameters.
 */


 /*****************************************************************************
 * Low-level API: 
 *
 * If used, the low-level initialization functions must be called in the
 * following order:
 *
 *	upcr_startup_init()
 *	upcr_startup_attach()
 *	upcr_startup_spawn()
 *	upcr_exit()		// not always needed: see description
 */

/*
 * Bootstraps a UPC job and performs any system-specific setup required.
 *
 * Called by all applications that use the UPC runtime at startup to bootstrap
 * the job before any other processing takes place.  Must be called before
 * any calls to any other functions in this specification, with the
 * command-line parameters passed to main (argc/argv), which may be modified
 * or augmented by this call (and are thus not safe to use before this call).
 * The semantics of any code executing before the call to
 * `upcr_startup_init()' is implementation-specific (for example, it is
 * undefined whether `stdin/stdout/stderr' are functional, or even how many
 * nodes will run that code).
 *
 * If the application using the runtime requires that it be run with a
 * fixed number of UPC threads, pass the thread count in the
 * 'static_threadcnt' parameter, and the program will abort with a error
 * message if the provided value does not match the execution environment that
 * will be provided.  Pass <= 0 for applications that can run with a dynamic
 * number of UPC threads.  If pthreads are used, a positive integer must be
 * supplied for 'default_pthreads_per_proc';  otherwise, pass 0.
 *
 * The 'main_name' parameter should be passed the name of the user's main()
 * UPC function: it is used to help users find that symbol name when debugging.
 * You may pass NULL if this is not needed.
 *
 * Upon return from `upcr_startup_init()', all the nodes of the job will be
 * running, stdout/stderr will be functional, and the basic job environment
 * will be established, however the primary network resources may not yet have
 * been initialized.  The following runtime functions are the only ones that
 * may be called between `upcr_startup_init()' and `upcr_startup_attach()':
 *
 *         `upcr_mynode()'
 *         `upcr_nodes()'
 *         `gasnet_getMaxLocalSegmentSize()'
 *         `gasnet_getMaxGlobalSegmentSize()'
 *         `upcr_getenv()'
 *         `upcr_global_exit()'
 *
 * All other runtime calls are prohibited until after a successful
 * `upcr_startup_attach()'.
 *
 * `upcr_startup_init()' may fail with a fatal error and
 * implementation-defined message if the nodes of the job cannot be
 * successfully bootstrapped.
 *
 * This function may be called repeatedly, but only the first invocation will
 * have any effect.
 */
void upcr_startup_init(int *pargc, char ***pargv, 
		       upcr_thread_t static_threadcnt, 
		       upcr_thread_t default_pthreads_per_proc, 
		       const char * main_name);


#define UPCR_ATTACH_ENV_OVERRIDE	1
#define UPCR_ATTACH_REQUIRE_SIZE	2
#define UPCR_ATTACH_SIZE_WARN		4

/*
 * Initializes the UPC runtime's network system, including shared memory
 * regions.  This function must be called after upcr_startup_init(), but
 * before any of the other upcr_startup_ functions.
 *
 * The 'default_shared_size' parameter gives the default size to request for
 * each UPC thread's shared memory region.  
 *
 * The 'default_shared_offset' parameter specifies the minimum distance (in
 * bytes) to provide between the current end of the regular C heap (commonly
 * provided by sbrk(0)) and the beginning of the shared memory region.  On
 * some platforms this offset becomes the growth limit for the regular C heap
 * (and thus determines how much more memory malloc(), calloc(), etc. can
 * return before failing).  On most systems, it is irrelevant, and 0 should be
 * passed, since using a large offset may limit the size of the shared memory
 * region.
 *
 * Values for 'default_shared_size' and 'default_shared_offset' must be
 * multiples of UPCR_PAGESIZE.  Both parameters may each be overridden at run
 * time if the 'flags' parameter allows it (see below).  
 * 
 * The size and address of the shared region that is created for each node in
 * the application can be determined after this call with the
 * gasnet_getSegmentInfo() function.  The size of the shared segment is
 * guaranteed to be be no larger than the requested size times the number of
 * pthreads on the node (with pthreads==1 if pthreads are not being used).
 * The region can be smaller than the requested amount, unless
 * UPCR_ATTACH_REQUIRE_SIZE is passed in the 'flags' parameter or the
 * UPC_REQUIRE_SHARED_SIZE environment variable is set to a nonempty value.  
 *
 * The 'flags' parameter can contain one or more of the following values (OR
 * them together if multiple flags are used):
 *   
 *   UPCR_ATTACH_ENV_OVERRIDE
 *      - if passed, the function checks the process' environment for 
 *        UPC_SHARED_HEAP_SIZE and/or UPC_SHARED_HEAP_OFFSET.  If these are
 *        set to valid values (a number immediately followed by a 'MB' or
 *        'GB', for example '32MB' for 32 megabytes, or '4GB' for 4
 *        gigabytes), they override the default_shared_size and
 *        default_shared_offset values, respectively.
 *
 *   UPCR_ATTACH_REQUIRE_SIZE
 *      - if this flag is passed, the function will die with an error message
 *        printed to stderr if the allocated shared region on any node is
 *        smaller than the amount that was asked for times the number of
 *        pthreads.  Can be overridden at startup by setting the
 *        UPC_REQUIRE_SHARED_SIZE environment variable to 'yes' or 'no'.
 *
 *   UPCR_ATTACH_SIZE_WARN
 *	- if this flag is passed, the runtime will issue a warning to stderr
 *	  if a smaller shared memory segment than requested will be used.
 *	  Can be overridden at startup by setting the UPC_SIZE_WARN
 *	  environment variable to 'yes' or 'no'.
 *
 * If any errors are encountered during upcr_startup_attach, an error message
 * is printed and the job is aborted.
 */

void upcr_startup_attach(uintptr_t default_shared_size, 
			 uintptr_t default_shared_offset, 
			 int flags);

/*
 * Struct argument to upcr_startup_spawn.
 */
struct upcr_startup_spawnfuncs {
    void (*pre_spawn_init)();
    void (*per_pthread_init)();
    void (*cache_init)(void *start, uintptr_t len);
    void (*heap_init)(void * start, uintptr_t len);
    void (*static_init)(void *start, uintptr_t len);
    int  (*main_function)(int argc, char **argv);
};

/*
 * Completes runtime initialization, including launching of any additional
 * pthreads (if a pthreaded runtime is used), and running of the user's main()
 * function (if any).
 *
 * '*pargc' and '*pargv' will be passed to the 'main_function' in the
 * 'spawnfuncs' argument (if it is non-NULL).  The 'static_data_size'
 * parameter should have a nonzero value if and only if static shared data get
 * their own section of the shared memory segment, separate from the shared
 * heaps (in Berkeley upc static data are allocated off of the heap; GCCUPC
 * uses a separate segment), and this should be the size of the static data
 * for each UPC thread.  The 'default_cache_size' indicates how much shared
 * memory to reserve for caching (by default):  since caching is not yet
 * implemented, pass 0.
 *
 * The 'spawnfuncs' parameter is a struct containing pointers to six
 * functions.
 *
 * The 'pre_spawn_init' function, if not NULL, is called first, before any
 * pthreads are launched.  It can contain any arbitrary initializations that
 * should happen only once per-process.
 *
 * Each of the remaining function pointers is called once on each UPC thread.  
 *
 * The 'per_pthread_init' function, if not NULL, is called by each pthread,
 * and can contain arbitrary initializations that need to happen on a
 * per-pthread basis.
 *
 * The 'cache_init' function is called next, but only if caching is being
 * used (i.e. if UPCR_USING_CACHING is defined).  It may be set to NULL
 * otherwise.  It must initialize the cache within the given region.  
 *
 * The 'heap_init' function is called next, and must initialize the runtime
 * shared heaps.  It is passed parameters indicating the starting address and
 * length of the region to use for the heap.  
 *
 * The 'static_init' function is then called.  It must set up all static data
 * for the UPC thread, and is passed the address and length of the segment to be
 * used. The length passed is guaranteed to be at least as large as provided in the
 * 'static_data_size' parameter, and the locations in the region are guaranteed to
 * have lower virtual addresses than the local addresses for any shared data
 * with affinity to this thread allocated using the dynamic shared memory
 * allocation functions. 
 *
 * Next, a barrier is performed.  Finally, if the 'main_function' parameter is
 * NULL, the function returns (and upcr_exit() should be used for any program
 * exit path, including the end of 'main').  Otherwise 'main_function' is
 * called with the command line arguments passed in 'argv' and 'argc' (with a
 * new copy made for each pthread if pthreads are used).  Again, upcr_exit()
 * should be used for any exit paths, except that returns from 'main_function'
 * are handled automatically, with the return value used as the program's exit
 * code.
 *
 * If 'main_function' != NULL, this function never returns.
 *
 * If any errors occur during this function, an error message is printed to
 * stderr and the job is terminated.
 *
 */
void upcr_startup_spawn(int *pargc, char ***pargv,
			uintptr_t static_data_size, 
			uintptr_t default_cache_size, 
			struct upcr_startup_spawnfuncs *spawnfuncs);


/* Runtime shutdown/exit function.
 *
 * This function should be called as the last program statement for all exit
 * paths from a UPC application, with the single exception that the
 * 'main_function' used by upcr_startup_spawn() may simply return an integer,
 * in which case the behavior is the same as if a call to this function had
 * been made with that value.
 *
 * The behavior of any code called after this function is undefined (i.e. it
 * may not execute).
 */
void upcr_exit(int exitcode);


/*****************************************************************************
 * Framework for external bootstrapping of the UPC runtime.
 *
 * The 'bupc_init() and 'bupc_init_reentrant()' functions allow 'external'
 * bootstrapping of the UPC runtime, i.e., initialization of the runtime by
 * programs which are not written entirely in UPC, and whose main() does not
 * appear in a UPC file. 
 *
 * To provide the full amount of needed data to the runtime, these functions
 * require a set of 'magic' global variables to be set by the Berkeley UPC
 * linker (upcc).  
 *****************************************************************************/

/*
 * Public, user-accessible function for bootstrapping the Berkeley UPC runtime
 * from a non-UPC C or C++ program that does not use pthreads. 
 *
 * A call to this function should be the first statement in main(). The
 * semantics of any code appearing before it is implementation-defined (for
 * example, it is undefined how many threads of control will run that code, or
 * whether stdin/stdout/stderr are functional).  The presence of environment
 * variables is also not guaranteed, but after this call returns bupc_getenv()
 * can be used to retrieve them (regular getenv() is not guaranteed to provide
 * them).
 *
 * The addresses of the command-line parameters must be passed, and it is not
 * safe to otherwise refer to them until after this function returns, as it
 * may supplement or modify them.
 *
 * Once bupc_init() has returned, the application may safely call into UPC
 * routines.  All exit paths from the program should call bupc_exit() as
 * their last program statement.  
 *
 * If any errors are encountered during this function's execution, an error
 * message is printed to stderr and the job will be aborted.
 *
 * This call may register UNIX signal handlers.  Client code should not
 * register signal handlers or rely on the correct propagation of signals.
 *
 * This function cannot be used with a pthreaded application.  Use
 * bupc_init_reentrant() instead.
 *
 * This function may be called repeatedly, but only the first invocation will
 * have any effect.
 *
 * If used within a hybrid MPI/UPC program, this function also ensures that
 * MPI_Init() is called, if needed.  MPI_Init() should NOT be called by user
 * code if this function is used.
 */
void bupc_init(int *argc, char ***argv); 

/*
 * A portable version of bupc_init(). A call to the bupc_init_reentrant()
 * function will initialize the Berkeley UPC runtime, regardless of whether
 * pthreads are used or not.
 *
 * In addition to the addresses of the regular main() command-line
 * parameters, this function takes a function pointer.  Calling
 * bupc_init_reentrant() will cause all the pthreads known to the UPC runtime
 * to be launched, and each of them will then call the 'pmain_func()' with
 * their own copy of the command-line parameters.  'pmain_func' may not be
 * NULL.
 *
 * Like with bupc_init(), bupc_exit() should be called at the end of all
 * program exit paths, except for returns from 'pmain_func'.  If
 * 'pmain_func' returns, its return value is used to indicate the exit code
 * of the program, and the UPC runtime will exit correctly without an explicit
 * call to bupc_exit() being required.
 *
 * No meaningful code should follow this function call, as it exits before
 * returning.
 *
 * Within pmain_func(), user code may call into UPC routines.  It is only safe
 * to access UPC routines from the original pthread(s) whose pmain_func() is
 * called, however.  If additional pthreads are launched by the user
 * application, they must not call UPC routines, or behavior is undefined.
 *
 * Within pmain_func, bupc_getenv() can be used to retrieve values of
 * environment variables (regular getenv() is not guranteed to provide them).
 *
 * If any errors are encountered during this function's execution, an error
 * message is printed to stderr and the job will be aborted.
 *
 * This call may register UNIX signal handlers.  Client code should not
 * register signal handlers or rely on the correct propagation of signals.
 *
 * This function may be called repeatedly, but only the first invocation will
 * have any effect.
 *
 * If used within a hybrid MPI/UPC program, this function also ensures that
 * MPI_Init() is called, if needed.  MPI_Init() should NOT be called by user
 * code if this function is used.
 *
 * This function can also be used by UPC compilers to bootstrap a UPC job, if
 * the user's 'main' function is renamed and passed in as the 'pmain_func'
 * parameter.
 */
void bupc_init_reentrant(int *argc, char ***argv, 
			 int (*pmain_func)(int, char **) ); 


/* Retrieve value of an environment variable.  This function should be used
 * instead of getenv(), which is not guaranteed to return correct
 * results.  It can only be called by threads launched by the UPC runtime
 * (i.e., not pthreads that have been launched by the user's own
 * pthread_create() calls), and cannot be called until either bupc_init() or
 * bupc_init_reentrant() has been called first.
 *
 * At present this function is only guaranteed to retrieve values
 * for environment variables with names beginning with 'UPC_' or
 * 'GASNET_'.
 *
 * The 'setenv()' and 'unsetenv' functions are not guaranteed to work in a
 * Berkeley UPC runtime environment, and should be avoided.
 */
char * bupc_getenv(const char *env_name);

/*
 * Runtime shutdown/exit routine.
 *
 * This function should be called as the last program statement by any program
 * that uses bupc_init() to bootstrap the UPC runtime.  It does not need to be
 * used when bupc_init_reentrant() is used.  The 'exitcode' provided will be
 * returned to the console that invoked the job, assuming all of the threads
 * terminate with this function, and use the same exitcode.  If different
 * threads of the program exit with different values, one of the values will
 * be chosen arbitrarily.  The behavior of any program statements after a call
 * to bupc_exit() is undefined.
 *
 * If used within a hybrid MPI/UPC program, bupc_exit() ensures that
 * MPI_Finalize() is called, if needed.  MPI_Finalize should NOT be called
 * by user code if this function is used.
 */
void bupc_exit(int exitcode);


/*
 * "Magic" variables that must appear in the linked executable to support use
 * of the bupc_init() and/or bupc_init_reentrant() functions. 
 *
 * Definitions of all variables with the 'UPCRL_' prefix must be provided by
 * client code.  NULL/zero values can be used if system does not support
 * creating executables that call UPC functions from within a non-UPC C or C++
 * program.  
 */

/* Set to 0 if dynamic threads used, else to the static UPC thread count */
extern upcr_thread_t	UPCRL_static_thread_count; 

/* Default size of shared memory segment and offset */
extern uintptr_t	UPCRL_default_shared_size;
extern uintptr_t	UPCRL_default_shared_offset;

/* Support for systems which store shared variables in a separate linker
 * section. 
 *
 * Some systems (ex: GNU UPC) convert 'shared' static data into a separate
 * linker section.  In this case, the values stored in pointers-to-shared are
 * within that linker section (since they are assigned by the linker).  
 *
 * To work with Berkeley UPC, the linker section must be mapped into a portion
 * of the shared region provided by gasnet.  Also, if pthreads are used, a
 * separate copy of the linker section must exist for each pthread.   
 *
 * These requirements are handled by the runtime so long as
 * UPCR_USING_LINKADDRS is defined, and the beginning/ending addresses of the
 * linker section are provided in 'UPCRL_shared_begin' and 'UPCRL_shared_end'.
 * The runtime uses these addresses to make a copy for each pthread of the
 * linker section.  Then, during each shared <=> local address conversion, an
 * offset is used to convert between the linker-assigned address for a given
 * pointer-to-shared and its the address within a pthread's copy of the static
 * data region.
 * 
 * On ELF-based systems, the beginning and ending addresses are typically
 * provided by arranging for the UPCRL_shared_begin/end to be the first and
 * last variables in the linker section that the linker sees (on most linker
 * this can be achieved by putting the symbols in separate 'first.o' and
 * 'last.o' object files that are then passed to the linker as the first and
 * last objects on the linker command line).
 */
#ifdef UPCR_USING_LINKADDRS
  extern char UPCRL_shared_begin[1];
  extern char UPCRL_shared_end[1];
#endif

/* Nonzero to launch an extra progress thread.  Primarily useful for parallel
 * debugger support.  Redundant on some networks, and likely to slow down
 * performance on the others */
extern int              UPCRL_progress_thread;

/* Default size of runtime cache, if used. */
extern uintptr_t	UPCRL_default_cache_size;

/* default flags to pass to upcr_attach, if upcr_startup_init() is used to
 * bootstrap the runtime */
extern int		UPCRL_attach_flags;

/* default pthreads per process: pass 0 if not using pthreads */
extern upcr_thread_t	UPCRL_default_pthreads_per_node;

/* Name used to rename user's main() function
 * - optional: may be set to null. */
extern const char *	UPCRL_main_name;

/* Hook for arbitrary per-process initializations */
extern void	      (*UPCRL_pre_spawn_init)();

/* Hook for arbitrary per-pthread initializations */
extern void	      (*UPCRL_per_pthread_init)();

/* Cache initialization function to pass to upcr_startup_attach()
 * - Implementation note: upcc uses 'upcri_init_cache', and this can be used
 *   by other systems. */
extern void (*UPCRL_cache_init)(void *start, uintptr_t len);

/* Heap initialization function to pass to upcr_startup_attach():
 * - Implementation note: upcc uses 'upcri_init_heaps', and this can be used
 *   other systems. */
extern void (*UPCRL_heap_init)(void * start, uintptr_t len);

/* Static data initialization function to pass to upcr_startup_attach()
 * - Implementation note: upcc uses a function generated at link time for
 *   this. */
extern void (*UPCRL_static_init)(void *start, uintptr_t len);

/* Function to ensure MPI has been initialized.  Use only if both MPI and a
 * gasnet conduit are being used, else set to NULL. This function must not
 * call MPI_Init is it has already been called by gasnet (use
 * MPI_Initialized() to check).  No other code in the application should call
 * MPI_Init(), else behavior is undefined. */
extern void (*UPCRL_mpi_init)(int *pargc, char ***pargv);

/* Function to ensure MPI is shut down at program completion. Use only if both
 * MPI and a gasnet conduit are being used, else set to NULL.  MPI_Finalize
 * should only be called if the UPCRL_mpi_init function called
 * MPI_Initialize().  No other code in the application should call
 * MPI_Finalize(), else behavior is undefined. */
extern void (*UPCRL_mpi_finalize)();


/* terminate the current job with a given exit code - non-collective operation
   this function may be called by any thread at any time after initialization and will cause the 
     system to flush all I/O, release all resources and terminate the job for all active threads
   this function is called automatically by the runtime system in the event of any
     fatal error or catchable terminate-the-program signals (e.g. segmentation fault)
   this function must be called at the end of main() after a barrier to ensure proper system exit
   the console which initiated the current job will receive the provided exitcode 
     as a program return value in a system-specific way
   if more than one thread calls upcr_global_exit() within a given synchronization phase
     with different exitcode values, the value returned to the console will be one of the 
     provided exit codes (chosen arbitrarily)
   Implementation notes:
     gasnet may send a fatal signal to indicate a remote node exited or crashed
     calls gasnet_exit to terminate the job on remote nodes
*/
void upcr_global_exit(int exitcode);

/* UPCR_BEGIN_FUNCTION() - this declaration must appear at the very beginning of every function
   (before any declarations) in generated code that intends to call any of the entry points  
   provided by this API. It provides the runtime system with a place for minimal per-function 
   initialization that may be necessary on some platforms, particularly when pthreads are used
*/
#define UPCR_BEGIN_FUNCTION() ???

/* UPCR_EXIT_FUNCTION() - this declaration must appear immediately before every
   exit point from any function that intends to call any of the entry points  
   provided by this API. It provides the runtime system with information for
   hooking events (temporary object cleanup, profiling) that need to occur at function
   exit.
*/
#define UPCR_EXIT_FUNCTION() ???

/* UPC thread number:  this is an unsigned integral type used to represent the
   0-to-(N-1) thread numbers of UPC threads within an application.  The size of this type may 
   vary depending on the pointer-to-shared representation used.
 */
typedef ??? upcr_thread_t;

/* Job Layout Queries - Interrogate thread information
*/
upcr_thread_t upcr_mythread(); /* returns a 0-based UPC thread index */
upcr_thread_t upcr_threads();  /* returns the number of UPC threads in the system */

/* When pthreads are used, UPC threads may be >= gasnet nodes. */
upcr_thread_t upcr_mynode();   /* returns a 0-based GASNet node index */
upcr_thread_t upcr_nodes();    /* returns the number of GASNet nodes in the system */

/* ------------------------------------------------------------------------------------ */
/*
  System parameters
  =================
  Provided by the runtime system implementation to describe the runtime environment
  Most of this information is probably also made available to the UPC translator
  at UPC-to-C compile-time (by some mechanism not specified here), 
  but some compilers may simply wish to generate generic code that compiles to have the 
  correct behavior at C compile time using these preprocessor symbols
*/
#define UPCR_MAX_BLOCKSIZE   ???
#define UPCR_MAX_THREADS     ???

/* Implementors note: 
  all code should be written such that UPCR_MAX_THREADS can simply be changed (up to 2^31-1) and 
    the system recompiled to increase the thread limit
  all code should be written such that UPCR_MAX_BLOCKSIZE can simply be changed 
    (along with a possible change to the type used to represent phase in upcr_shared_t) 
    and the system recompiled to increase the block size limit
*/

/* UPCR_PLATFORM_ENVIRONMENT provides the platform-independent UPC compiler with
 *  some clues about the memory layout of the current platform to aid optimization
 *  trade-offs. 
 * The possible configuration values are:
 * UPCR_PURE_SHARED - purely shared memory, remote memory accesses are handled entirely 
 *  by hardware with no software interpretation overhead
 * UPCR_PURE_DISTRIBUTED - purely distributed memory, remote memory accesses are handled by some 
 *  software networking layer
 * UPCR_SHARED_DISTRIBUTED - a mixture of the above - some remote memory accesses are handled 
 *  by hardware, others by a software networking layer
 * UPCR_OTHER - any configuration not captured by the above options
 */
#define UPCR_PLATFORM_ENVIRONMENT ???

/* size of memory page on operating system, in bytes */
#define UPCR_PAGESIZE ???

/* ------------------------------------------------------------------------------------ */
/*
  Pointer-to-Shared Representation
  ================================

  *** upcr_shared_ptr_t - general pointer-to-shared
  *** upcr_pshared_ptr_t - "phase-less" pointer-to-shared, blocksize == 1 or blocksize indef

  opaque types representing a generic (i.e. untyped) pointer-to-shared defined by upcr 
  and used by generated code. In general, generated code NEVER looks inside 
  this opaque type, but there may be cases where we want to expose some 
  information to the UPC optimizer.

  Note these two pointer-to-shared categories are NOT interchangeable - the generated code
  must explicitly select the correct category pointer for the current static blocksize and 
  call the correct version of the appropriate entry points below
*/
typedef ??? upcr_shared_ptr_t;
typedef ??? upcr_pshared_ptr_t;

/*  
  Pointer-to-shared phase:  represents the phase of a pointer-to-shared, i.e., the index of the current 
  element in the current block of shared memory.  This is an unsigned integral type, whose size 
  may vary depending on the pointer-to-shared implementation.  
 */
typedef ??? upcr_phase_t;


/* Implementation Notes:
    The contents of these typedefs is NOT part of the specification and will
    vary with implementation. Therefore, the fields shown should NOT be accessed 
    by the generated code or compiler 
      typedef struct {
        uintptr_t _localaddr;      // make this the first field to speed pointer use
        unsigned short _threadid;  // use shorts so the entire struct fits in 2 words 
        unsigned short _phase;
      } upcr_shared_ptr_t

      typedef struct {
        uintptr_t _localaddr;      // make this the first field to speed pointer use
        short _threadid;           // use shorts so the entire struct fits in 2 words
      } upcr_pshared_ptr_t
*/
/* ------------------------------------------------------------------------------------ */
/*
  Pointer-to-Shared Manipulation
  ==============================
*/

/* Convert a pointer-to-shared with affinity to the current thread 
     into a local pointer. 
   If sptr does not have affinity to the calling thread the 
     result is implementation-specific
*/
void *upcr_shared_to_local(upcr_shared_ptr_t sptr);
void *upcr_pshared_to_local(upcr_pshared_ptr_t sptr);

/* Convert a pointer-to-shared into a virtual address usable by the calling thread
   The pointer target must refer to shared memory with affinity to the calling thread, 
   or otherwise to shared memory which the calling thread has the ability to access 
   directly via load/store to virtual memory, otherwise the call is erroneous.
   The extent of memory which falls into the latter category is implementation-dependent
   and may be empty. Furthermore, the virtual addresses returned by this function
   are only guaranteed to be valid on the calling thread.
*/
void *upcr_shared_to_processlocal(upcr_shared_ptr_t sptr);
void *upcr_pshared_to_processlocal(upcr_pshared_ptr_t sptr);

/* Convert a local ptr into the current thread's shared memory space into 
   a pointer-to-shared appropriate for use in remote operations from other threads. 
   The phase field is set to zero. Some implementations may issue an error if lptr 
   does not point into the shared region for the current thread. 
   Note this operation is not accessible from the UPC source level, but may be useful 
    for generated code nonetheless (e.g. to support a debugger)
   The _ref versions modify a pointer-to-shared in place rather than returning a 
    pointer-to-shared value, which may be more efficient in some implementations
*/
upcr_shared_ptr_t upcr_local_to_shared(void *lptr);
void upcr_local_to_shared_ref(void *lptr, upcr_shared_ptr_t *result);

upcr_pshared_ptr_t upcr_local_to_pshared(void *lptr);
void upcr_local_to_pshared_ref(void *lptr, upcr_pshared_ptr_t *result);

/* Same as above, but sets the phase and thread to a particular value. 
   phase is expressed in number of elements
 */

upcr_shared_ptr_t upcr_local_to_shared_withphase(void *lptr, upcr_phase_t phase, upcr_thread_t threadid);

void upcr_local_to_shared_ref_withphase(void *lptr, upcr_phase_t phase, upcr_thread_t threadid, 
					upcr_shared_ptr_t *result);

/* Convert back and forth between shared and pshared representations
    upcr_pshared_to_shared sets phase to zero
   The _ref versions modify a pointer-to-shared in place rather than returning a 
    pointer-to-shared value, which may be more efficient in some implementations
*/

upcr_pshared_ptr_t upcr_shared_to_pshared(upcr_shared_ptr_t sptr);
void upcr_shared_to_pshared_ref(upcr_shared_ptr_t sptr, upcr_pshared_ptr_t *result);

upcr_shared_ptr_t upcr_pshared_to_shared(upcr_pshared_ptr_t sptr);
void upcr_pshared_to_shared_ref(upcr_pshared_ptr_t sptr, upcr_shared_ptr_t *result);


/* Same as above, but sets the phase to a particular value. 
   phase is expressed in number of elements
*/

upcr_shared_ptr_t upcr_pshared_to_shared_withphase(upcr_pshared_ptr_t sptr, upcr_phase_t phase);

void upcr_pshared_to_shared_ref_withphase(upcr_pshared_ptr_t sptr, upcr_phase_t phase, 
					  upcr_shared_ptr_t *result);

/* reset the phase field of a given pointer-to-shared to zero
   (used for casting between block sizes)
 */

upcr_shared_ptr_t upcr_shared_resetphase(upcr_shared_ptr_t sptr);
void upcr_shared_resetphase_ref(upcr_shared_ptr_t *sptr);

/* Returns the thread number that has affinity to the given pointer-to-shared, 
   or 0 for a NULL pointer-to-shared. If sptr is not a valid pointer-to-shared, 
   the results are undefined.
*/

upcr_thread_t upcr_threadof_shared(upcr_shared_ptr_t sptr);
upcr_thread_t upcr_threadof_pshared(upcr_pshared_ptr_t sptr);

/* Returns the phase field of the given pointer-to-shared, 
   Returns 0 for a NULL pointer-to-shared or any phaseless pointer-to-shared
   phase is expressed in number of elements
 */

upcr_phase_t upcr_phaseof_shared(upcr_shared_ptr_t sptr);
upcr_phase_t upcr_phaseof_pshared(upcr_pshared_ptr_t sptr); /* always returns zero */

/* Returns an implementation-defined value reflecting the local address 
   of the object pointed to. This may or may not be the actual virtual address 
   where the object is stored - use upcr_to_local() when casting pointers-to-shared 
   to local pointers.
*/
uintptr_t upcr_addrfield_shared(upcr_shared_ptr_t sptr);
uintptr_t upcr_addrfield_pshared(upcr_pshared_ptr_t sptr);

/* upcr_affinitysize calculates the exact size of the local portion of the data
   in a shared object with affinity to a given thread, specified by threadid.
   totalsize should be the total number of bytes in the shared object.
   nbytes is the block size in BYTES.
*/
size_t upcr_affinitysize(size_t totalsize, size_t nbytes, upcr_thread_t threadid);

/* return non-zero iff the given pointer-to-shared is a null reference */

int upcr_isnull_shared(upcr_shared_ptr_t sptr);
int upcr_isnull_pshared(upcr_pshared_ptr_t sptr);

/* Return non-zero iff the given pointer is not valid, i.e., is not NULL, and
 * does not point to a valid shared memory address on some thread */
int upcr_isvalid_shared(upcr_shared_ptr_t *p);
int upcr_isvalid_pshared(upcr_pshared_ptr_t *p);

/* Set a pointer-to-shared to NULL. */

int upcr_setnull_shared(upcr_shared_ptr_t *p);
int upcr_setnull_pshared(upcr_pshared_ptr_t *p);

/* Pointer-to-shared increments/decrements - 
  add a positive or negative displacement to a pointer-to-shared. 
  Both the inc and blockelems arguments should be expressed in number of elements
  elemsz is the target element size in bytes
  The "add" versions return an updated pointer-to-shared, 
  the "inc" versions modify the input pointer-to-shared in place. 

  Pointers with a definite static blocksize > 1 should use the "shared" version, 
  pointers-to-shared with indef blocksize use the "psharedI" version
  pointers-to-shared with blocksize == 1 use the "pshared1" version
*/
upcr_shared_ptr_t upcr_add_shared(upcr_shared_ptr_t sptr,   size_t elemsz, ptrdiff_t inc, size_t blockelems);
void              upcr_inc_shared(upcr_shared_ptr_t *psptr, size_t elemsz, ptrdiff_t inc, size_t blockelems);

upcr_pshared_ptr_t upcr_add_psharedI(upcr_pshared_ptr_t sptr,   size_t elemsz, ptrdiff_t inc);
void               upcr_inc_psharedI(upcr_pshared_ptr_t *psptr, size_t elemsz, ptrdiff_t inc);

upcr_pshared_ptr_t upcr_add_pshared1(upcr_pshared_ptr_t sptr,   size_t elemsz, ptrdiff_t inc);
void               upcr_inc_pshared1(upcr_pshared_ptr_t *psptr, size_t elemsz, ptrdiff_t inc);

/* return non-zero iff ptr1 and ptr2 are both null, 
   or if they currently reference the same memory location
 */
int upcr_isequal_shared_shared(upcr_shared_ptr_t ptr1, upcr_shared_ptr_t ptr2);
int upcr_isequal_shared_pshared(upcr_shared_ptr_t ptr1, upcr_pshared_ptr_t ptr2);
int upcr_isequal_pshared_pshared(upcr_pshared_ptr_t ptr1, upcr_pshared_ptr_t ptr2);
int upcr_isequal_shared_local(upcr_shared_ptr_t ptr1, void *ptr2);
int upcr_isequal_pshared_local(upcr_pshared_ptr_t ptr1, void *ptr2);

/* Comparison and subtraction of pointers-to-shared -
  Compare pointers-to-shared sptr1 and sptr2 and calculate sptr1 - sptr2.
    blockelems is the block size for both ptrs, expressed in num elements
      (UPC type compatibility semantics require both pointers have the same blocksize)
    elemsz is the target element size in bytes

  Pointers with a definite static blocksize > 1 should use the "shared" version, 
  pointers-to-shared with indef blocksize use the "psharedI" version
  pointers-to-shared with blocksize == 1 use the "pshared1" version
  
 There are three possible cases:
  returns 0 if sptr1 and sptr2 currently reference the same memory cell (i.e. upcr_isequal() would return true)
  returns a positive or negative value N (an element count) to indicate that 
    upcr_add_shared(sptr2, elemsz, N, blockelems2) would yield a pointer-to-shared that is upcr_isequal() to sptr1
    (if N > 0, we say that sptr1 is "greater than" sptr2, and if N < 0 we say that sptr1 is "less than" sptr2)
  otherwise, fatal error if there is no value which can be added to sptr1 to make it equal sptr2
   (e.g. sptr1 and sptr2 are indef blocksize pointers with different affinities)
*/

ptrdiff_t upcr_sub_shared  (upcr_shared_ptr_t  sptr1, upcr_shared_ptr_t  sptr2, size_t elemsz, size_t blockelems);
ptrdiff_t upcr_sub_psharedI(upcr_pshared_ptr_t sptr1, upcr_pshared_ptr_t sptr2, size_t elemsz);
ptrdiff_t upcr_sub_pshared1(upcr_pshared_ptr_t sptr1, upcr_pshared_ptr_t sptr2, size_t elemsz);


/* Affinity checks - return non-zero iff the given pointer-to-shared currently
    has affinity to the calling thread (or indicated thread, respectively)
 */
int upcr_hasMyAffinity_shared (upcr_shared_ptr_t sptr);
int upcr_hasMyAffinity_pshared(upcr_pshared_ptr_t sptr);

int upcr_hasAffinity_shared (upcr_shared_ptr_t sptr,  upcr_thread_t threadid);
int upcr_hasAffinity_pshared(upcr_pshared_ptr_t sptr, upcr_thread_t threadid);

/* ------------------------------------------------------------------------------------ */
/*
  Shared Memory Accesses
  ======================
  Transfer scalar values to/from shared memory which may or may not be remote

  These comments apply to all put/get functions:

    Only functions suffixed with '_strict' can be used to implement a strict
     operation: all other data movement functions in this specification are
     implicitly relaxed.
    nbytes should be a compile-time constant whenever possible
    nbytes must be >= 0 and has no maximum size, but implementations
     will likely optimize for small powers of 2
    source and target addresses (both local and shared) are assumed to be properly 
     aligned for accessing objects of size nbytes 
    if nbytes extends beyond the current block the results are undefined
    destoffset(srcoffset) is an optional positive or negative BYTE offset, which is added to 
      the address indicated by dest(src) to determine the target(source) address for the put(get) operation 
      (Useful for puts(gets) with shared structures)
    if adding the number of bytes indicated by destoffset(srcoffset) to dest(src) would cause
      dest(src) to pass the end of the current block, the result is undefined
    If the source and target memory areas overlap (for memory-to-memory transfers) but do not exactly coincide, 
      the resulting target memory contents are undefined
    Implementations are likely to optimize for the important special case of zero destoffset(srcoffset)
*/

/* UPCR_ATOMIC_MEMSIZE() is a macro describing the datatype sizes at which memory accesses
     will be done atomically. Given a datatype width sz (in bytes) it will return non-zero at compile time
     iff a local or shared memory access of exactly sz bytes, to an address aligned by sz bytes, will happen 
     atomically with respect to accesses from other threads to the same location.
   A non-zero return value for a given size does not guarantee atomicity for smaller sizes
     or unaligned accesses of the given size
   Some architectures may provide no atomic sizes
   UPCR_ATOMIC_MEMSIZE(0) will return the largest atomic size available, or zero if none exists
*/
#define UPCR_ATOMIC_MEMSIZE(sz) ???

/* --- Blocking memory-to-memory puts and gets ---
  A call to these functions will block until the transfer is complete, 
    and the contents of the destination memory are undefined until it completes. 
  If the contents of the source memory change while the operation is in progress 
    the result will be implementation-specific.
  The '_strict' versions implement strict UPC puts/gets.  It is an error for any
    nonblocking (relaxed or strict) operation to overlap a strict put/get.
*/
void upcr_put_shared (upcr_shared_ptr_t dest,  ptrdiff_t destoffset, const void *src, size_t nbytes);
void upcr_put_pshared(upcr_pshared_ptr_t dest, ptrdiff_t destoffset, const void *src, size_t nbytes);
void upcr_put_shared_strict (upcr_shared_ptr_t dest,  ptrdiff_t destoffset, const void *src, size_t nbytes);
void upcr_put_pshared_strict(upcr_pshared_ptr_t dest, ptrdiff_t destoffset, const void *src, size_t nbytes);

void upcr_get_shared (void *dest, upcr_shared_ptr_t  src, ptrdiff_t srcoffset, size_t nbytes);
void upcr_get_pshared(void *dest, upcr_pshared_ptr_t src, ptrdiff_t srcoffset, size_t nbytes);
void upcr_get_shared_strict (void *dest, upcr_shared_ptr_t  src, ptrdiff_t srcoffset, size_t nbytes);
void upcr_get_pshared_strict(void *dest, upcr_pshared_ptr_t src, ptrdiff_t srcoffset, size_t nbytes);

/* --- Non-blocking operations --- 
  The following functions provide non-blocking, split-phase memory access to shared data.
  All such non-blocking operations require an initiation (put or get) and a subsequent
    synchronization on the completion of that operation before the result is guaranteed.
  Synchronization of a get operation means the local result is ready to be examined, and
    will contain a value held by the shared location at some time in the interval between
    the call to the initiation function and the successful completion of the synchronization
    (note this specifically allows implementations to delay the underlying read 
     until the synchronization operation is called, provided they preserve the blocking
     semantics of the synchronization function)
  Synchronization of a put operation means the source data has been written to the shared location
    and get operations issued subsequently by any thread will receive the new value or a 
    subsequently written value (assuming no other threads are writing the location)
  There are two categories of non-blocking operations:
    "explicit handle" (nb) - return a specific handle to caller which is used for synchronization
      this handle can be used to synchronize a specific subset of the nb operations in-flight
    "implicit handle" (nbi) - don't return a handle - synchronization is accomplished
      by calling a synchronization routine that synchronizes all outstanding nbi operations

  Note that the order in which non-blocking operations complete is intentionally unspecified - 
    the system is free to coalesce and/or reorder non-blocking operations with respect to other 
    blocking or non-blocking operations, or operations initiated from a separate thread - 
    the only ordering constraints that must be satisfied are those explicitly enforced using 
    the synchronization functions (i.e. the non-blocking operation is only guaranteed to occur 
    somewhere in the interval between initiation and successful synchronization on that operation).

  The compiler bears full responsibility for maintaining the memory consistency semantics
    presented to the UPC user when using non-blocking operations - the compiler must generate
    synchronizations at the appropriate points (e.g. before calling upcr_unlock() or upcr_notify())

  Implementors should attempt to make the non-blocking operations return as quickly as possible - 
    however in some cases (e.g. when a large number of non-blocking operations have been issued 
    or the network is otherwise busy) it may be necessary to block temporarily while waiting 
    for the network to become available. In any case, all implementations must support an 
    unlimited number of non-blocking operations in-progress - that is, the client is free 
    to issue an unlimited number of non-blocking operations before issuing a sync operation, 
    and implementations must handle this correctly without deadlock. 

  The '_strict' versions of these functions implement strict nonblocking UPC puts/gets.  
    It is an error for any nonblocking (relaxed or strict) operation to overlap a strict put/get.  
    Only one strict nonblocking operation may be pending at any time, and no other operation 
    (relaxed or strict) may be initiated or completed in between that strict operation's
    initiation and its completion.
*/

/* upcr_handle_t is a datatype used for representing a non-blocking
     operation currently in-flight that was initiated with an "explicit handle" 
     non-blocking operation. The contents are implementation-defined.
   UPCR_INVALID_HANDLE is a compile-time constant which can be used as a "dummy"
     handle value, which is ignored by all the operations that take upcr_handle_t's
     furthermore this value must be the result of setting all the bits in the 
     upcr_handle_t datatype to zero.
   Implementations are free to define the upcr_handle_t type to be any 
     reasonable and appropriate size, although they are recommended to use a 
     type which fits within a single standard register on the target architecture. 
     In any case, the datatype should be wide enough to express at least 2^16-1 
     different handle values, to prevent limiting the number of non-blocking operations 
     in progress due to the number of handles available.
   upcr_handle_t values are thread-specific. 
     In other words, it is an error to obtain a handle value by initiating a non-blocking 
     operation on one thread, and later pass that handle into a synchronization function 
     from a different thread (results are undefined).
   Similarly, synchronization functions for "implicit handle" non-blocking operations
     only synchronize on "implicit handle" operations initiated from the calling thread.
   It _is_ legal to pass upcr_handle_t values into function callees or back to function callers
*/
typedef ??? upcr_handle_t;
#define UPCR_INVALID_HANDLE ???

/* --- Non-blocking memory-to-memory, explicit handle (nb) ---
  These calls initiate a non-blocking operation and return "immediately" with a 
    non-blocking handle that can be used to later synchronize the operation, 
    using one of the explicit sync operations
  Once the put version returns, the source memory may safely be overwritten
  For the get version, if the contents of the source memory change while the 
    operation is in progress the result will be implementation-specific.
  The contents of the destination memory address are undefined until a 
    synchronization completes successfully for the non-blocking operation. 
  The operations may return UPCR_INVALID_HANDLE to indicate it was possible to complete
    the operation immediately without blocking (e.g. operations on shared memory 
    with affinity to this thread)
  It is an error to discard the upcr_handle_t value for an operation in-flight - 
    i.e. to initiate an operation and never synchronize on its completion
*/
upcr_handle_t upcr_put_nb_shared (upcr_shared_ptr_t dest,  ptrdiff_t destoffset, 
                                  const void *src, size_t nbytes);
upcr_handle_t upcr_get_nb_shared (void *dest, upcr_shared_ptr_t  src, ptrdiff_t srcoffset, size_t nbytes);

upcr_handle_t upcr_put_nb_pshared(upcr_pshared_ptr_t dest, ptrdiff_t destoffset, 
                                  const void *src, size_t nbytes);
upcr_handle_t upcr_get_nb_pshared(void *dest, upcr_pshared_ptr_t src, ptrdiff_t srcoffset, size_t nbytes);

upcr_handle_t upcr_put_nb_shared_strict (upcr_shared_ptr_t dest,  ptrdiff_t destoffset, 
                                         const void *src, size_t nbytes);
upcr_handle_t upcr_get_nb_shared_strict (void *dest, upcr_shared_ptr_t  src, ptrdiff_t srcoffset, 
                                         size_t nbytes);
upcr_handle_t upcr_put_nb_pshared_strict (upcr_pshared_ptr_t dest, ptrdiff_t destoffset, 
                                          const void *src, size_t nbytes);
upcr_handle_t upcr_get_nb_pshared_strict(void *dest, upcr_pshared_ptr_t src, 
                                         ptrdiff_t srcoffset, size_t nbytes);

/* --- Explicit handle synchronization (for get_nb and put_nb) ---
  upcr supports two basic variants of synchronization for non-blocking operations - 
    trying (polling) and waiting (blocking). 
  All explicit synchronization functions take one or more upcr_handle_t values 
    as input and either return an indication of whether the operation has completed 
    or block until it completes. 
*/

/* Single operation explicit synchronization
  Synchronize on the completion of a single, particular non-blocking operation 
    that was initiated by this thread. 
  upcr_wait_syncnb() blocks until the specified operation has completed 
    (or returns immediately if it has already completed). 
    In any case, the handle value is "dead" after upcr_wait_syncnb() returns and 
    may not be passed to future synchronization operations 
  upcr_try_syncnb() always returns immediately, with the value 1 if the 
    operation is complete (at which point the handle value is "dead", and may 
    not be used in future synchronization operations), or 0 if the operation is 
    not yet complete and future synchronization is necessary to complete this operation.

  upcr_{try,wait}_syncnb_strict() operate just as upcr_{try.wait}_syncnb() do,
  but must be used for strict operations (and only for strict operations).
 
  It is legal to pass UPCR_INVALID_HANDLE as input to these functions:
  upcr_wait_syncnb{,_strict}(UPCR_INVALID_HANDLE) return immediately and
  upcr_try_syncnb{,_strict}(UPCR_INVALID_HANDLE) return 1.

  It is an error to pass a upcr_handle_t value for an operation which has already 
    been successfully synchronized using one of the explicit synchronization functions
    and doing so has undefined results
*/
void upcr_wait_syncnb(upcr_handle_t handle)
int  upcr_try_syncnb(upcr_handle_t handle)
void upcr_wait_syncnb_strict(upcr_handle_t handle)
int  upcr_try_syncnb_strict(upcr_handle_t handle)

/* Multiple operation explicit synchronization 
  Synchronize on the completion of an array of non-blocking operation handles 
    (all of which were initiated by this thread). 
  numhandles specifies the number of handles in the provided array of handles. 
    requires numhandles >= 0
  upcr_wait_syncnb_all() blocks until all the specified operations have completed 
    (or returns immediately if they have all already completed). 
  upcr_try_syncnb_all() always returns immediately, with the value 1 if all 
    the specified operations have completed, or 0 if one or more of the operations 
    is not yet complete and future synchronization is necessary to complete some 
    of the operations.

  upcr_try_syncnb_all() will modify the provided array to reflect completions - 
    handles whose operations have completed are overwritten with the value UPCR_INVALID_HANDLE, 
    and the client may test against this value when upcr_try_syncnb_all() returns 0 
    to determine which operations are complete and which are still pending.

  implementations of upcr_wait_syncnb_all() _may_ modify the provided array to reflect completions, 
    but this is not required (and not necessarily for the client since it always blocks until all 
    operations in the list are complete)

  It is legal to pass the value UPCR_INVALID_HANDLE in some or all of the array entries, 
    and both functions will ignore them so those values have no effect on behavior. 
  If all entries in the array are UPCR_INVALID_HANDLE (or numhandles==0), then 
    upcr_try_syncnb_all() will return 1.
  Note that there are no strict variants, since the UPC memory consistency
    model prohibits multiple outstanding strict operations.
*/

void upcr_wait_syncnb_all(upcr_handle_t *, size_t numhandles)
int  upcr_try_syncnb_all(upcr_handle_t *, size_t numhandles)

/* 
  These operate analogously to the syncnb_all variants, except they only 
    wait/test for at least one operation corresponding to a _valid_ handle in the 
    provided list to be complete (the valid handles values are all those which are not 
    UPCR_INVALID_HANDLE). 
  Specifically, upcr_wait_syncnb_some() will block until at least one of the 
    valid handles in the list has completed, and indicate the operations that have 
    completed by setting the corresponding handles to the value UPCR_INVALID_HANDLE. 
  Similarly, upcr_try_syncnb_some will check if at least one valid handle in the 
    list has completed (setting all completed handles to UPCR_INVALID_HANDLE) and 
    return 1 if it detected at least one completion or 0 otherwise (except as below)

  Both functions ignore UPCR_INVALID_HANDLE values. If the input list is empty or 
    consists only of UPCR_INVALID_HANDLE values, upcr_wait_syncnb_some will 
    return immediately and upcr_try_sync_some will return 1.
*/

void upcr_wait_syncnb_some(upcr_handle_t *, size_t numhandles)
int  upcr_try_syncnb_some(upcr_handle_t *, size_t numhandles)


/* --- Non-blocking memory-to-memory, implicit handle (nbi) ---
  These calls initiate a non-blocking operation and return "immediately" 
    the operation must later be completed using a call to one of the implicit sync functions
  Once the put version returns, the source memory may safely be overwritten
  For a get operation, if the contents of the source memory change while the operation is in progress 
    the result will be implementation-specific.
  The contents of the destination memory address are undefined until a 
    synchronization completes successfully for the non-blocking operation. 
  There are no strict nbi operations, as the UPC memory consistency model prohibits multiple 
    outstanding strict operations.
*/
void upcr_put_nbi_shared (upcr_shared_ptr_t dest,  ptrdiff_t destoffset, 
                          const void *src, size_t nbytes);
void upcr_get_nbi_shared (void *dest, upcr_shared_ptr_t  src, ptrdiff_t srcoffset, size_t nbytes);

void upcr_put_nbi_pshared(upcr_pshared_ptr_t dest, ptrdiff_t destoffset, 
                          const void *src, size_t nbytes);
void upcr_get_nbi_pshared(void *dest, upcr_pshared_ptr_t src, ptrdiff_t srcoffset, size_t nbytes);

/* --- Implicit handle synchronization (for get_nbi and put_nbi) --- */
/* Synchronize on an implicit list of outstanding non-blocking operations.
  These functions implicitly specify a set of non-blocking operations on which to synchronize - 
    either all outstanding implicit-handle gets initiated by this thread, 
    all outstanding implicit-handle puts initiated by this thread, 
    or all outstanding implicit-handle operations (both puts and gets) initiated by this thread 
    (where outstanding is defined as all those operations which have been initiated 
    but not yet completed through a successful implicit-handle synchronization). 
  The wait variants block until all operations in this implicit set have completed 
  The try variants test whether all operations in the implicit set have completed, 
    and return 1 if so (or if there are no outstanding implicit-handle operations)
    or 0 otherwise 
  Implicit synchronization functions will synchronize operations initiated within
    other function frames by this thread
  As with the initiation functions, there are no strict variants here.
*/

void upcr_wait_syncnbi_gets();
void upcr_wait_syncnbi_puts();
void upcr_wait_syncnbi_all();
int  upcr_try_syncnbi_gets();
int  upcr_try_syncnbi_puts();
int  upcr_try_syncnbi_all();

/* --- Implicit region synchronization --- */
/* In some cases, it may be useful or desirable to initiate a number of non-blocking 
     shared-memory operations (possibly without knowing how many at compile-time) and
     synchronize them at a later time using a single, fast synchronization.
   Simple implicit handle synchronization may not be appropriate for this situation if
     there are intervening implicit accesses which are not to be synchronized.
   This situation could be handled using explicit-handle non-blocking operations and a
     list synchronization (e.g. upcr_wait_syncnb_all()), but this may not be desirable
     because it requires managing an array of handles (which could have negative cache
     effects on performance, or could be expensive to allocate when the size is not known
     until runtime).
   To handle these cases, we provide "implicit access region" synchronization, described below.
*/

/* upcr_begin_nbi_accessregion() and upcr_end_nbi_accessregion() are used to define an 
     implicit access region (any code which dynamically executes between the begin and
     end calls is said to be "inside" the region)
   The begin and end calls must be paired, and may not be nested recursively or the results
     are undefined.
   It is erroneous to call any implicit-handle synchronization function within the region.
   All implicit-handle non-blocking operations initiated inside the region become "associated"
     with the abstract region handle being constructed. upcr_end_nbi_accessregion() returns an 
     explicit handle which collectively represents all the associated implicit-handle operations 
     (those initiated within the region). 
   This handle can then be passed to the regular explicit-handle synchronization functions, 
     and will be successfully synchronized when all of the associated non-blocking 
     operations initiated in the region have completed. 
   The associated operations cease to be implicit-handle operations, and are _not_ synchronized
     by subsequent calls to the implicit-handle synchronization functions (e.g. upcr_wait_syncnbi_all())
   Explicit-handle operations initiated within the region operate as usual and do _not_ become
     associated with the region.
*/
void          upcr_begin_nbi_accessregion();
upcr_handle_t upcr_end_nbi_accessregion();
/* sample code:

  upcr_begin_nbi_accessregion(); // begin the region

  upcr_put_nbi_shared(...); // becomes associated with this region
  while (...) {
    upcr_put_nbi_shared(...); // becomes associated with this region
  }

  h2 = upcr_get_nb_shared(...); // unrelated explicit-handle operation not associated with region
  upcr_wait_syncnb(h2);

  handle = upcr_end_nbi_accessregion(); // end the region and get the handle

  .... // other code, which may include unrelated implicit-handle operations+syncs, or other regions, etc

  upcr_wait_syncnb(handle); // wait for all the operations associated with the region to complete
  */

/* --- Register-memory operations --- */

/* upcr_register_value_t represents the largest unsigned integer type that can fit entirely
     in a single CPU register for the current architecture and ABI. 
   SIZEOF_UPCR_REGISTER_T is a preprocess-time literal integer constant (i.e. not "sizeof()")
     indicating the size of this type in bytes
 */
typedef unsigned ??? upcr_register_value_t;
#define SIZEOF_UPCR_REGISTER_VALUE_T ???

/* the value forms of put - these take the value to be put as input parameter to avoid
     forcing outgoing values to local memory in generated code. 
   Otherwise, the behavior is identical to the memory-to-memory versions of put above
   requires: nbytes > 0 && nbytes <= SIZEOF_UPCR_REGISTER_VALUE_T
   The value written to the target address is a direct byte copy of the 8*nbytes low-order bits of value,
     written with the endianness appropriate for an nbyte integral value on the current architecture
   The non-blocking forms of value put must be synchronized using the explicit or implicit
     synchronization functions defined above, as appropriate
   The semantics of the _strict versions are the same as for the regular, non-value put/get functions
*/
void upcr_put_shared_val (upcr_shared_ptr_t dest, ptrdiff_t destoffset, 
			  upcr_register_value_t value, size_t nbytes);
void upcr_put_shared_val_strict (upcr_shared_ptr_t dest, ptrdiff_t destoffset, 
			         upcr_register_value_t value, size_t nbytes);
upcr_handle_t upcr_put_nb_shared_val (upcr_shared_ptr_t dest, ptrdiff_t destoffset, 
				      upcr_register_value_t value, size_t nbytes);
upcr_handle_t upcr_put_nb_shared_val_strict (upcr_shared_ptr_t dest, ptrdiff_t destoffset, 
				             upcr_register_value_t value, size_t nbytes);
void upcr_put_nbi_shared_val(upcr_shared_ptr_t dest, ptrdiff_t destoffset, 
			     upcr_register_value_t value, size_t nbytes);

void upcr_put_pshared_val (upcr_pshared_ptr_t dest, ptrdiff_t destoffset, 
			   upcr_register_value_t value, size_t nbytes);
void upcr_put_pshared_val_strict (upcr_pshared_ptr_t dest, ptrdiff_t destoffset, 
			          upcr_register_value_t value, size_t nbytes);
upcr_handle_t upcr_put_nb_pshared_val (upcr_pshared_ptr_t dest, ptrdiff_t destoffset, 
				       upcr_register_value_t value, size_t nbytes);
upcr_handle_t upcr_put_nb_pshared_val_strict (upcr_pshared_ptr_t dest, ptrdiff_t destoffset, 
				              upcr_register_value_t value, size_t nbytes);
void upcr_put_nbi_pshared_val(upcr_pshared_ptr_t dest, ptrdiff_t destoffset, 
			      upcr_register_value_t value, size_t nbytes);


/* blocking value get - these return the fetched value to avoid
     forcing incoming values to local memory in generated code. 
   Otherwise, the behavior is identical to the memory-to-memory blocking get
   requires: nbytes > 0 && nbytes <= SIZEOF_UPCR_REGISTER_VALUE_T
   The value returned is the one obtained by reading the nbytes bytes starting at the source address
     with the endianness appropriate for an nbyte integral value on the current architecture
     and setting the high-order bits (if any) to zero (i.e. no sign-extension)
   The semantics of the _strict versions are the same as for the regular,
     non-value put/get functions
*/
upcr_register_value_t upcr_get_shared_val (upcr_shared_ptr_t  src, ptrdiff_t srcoffset, size_t nbytes);
upcr_register_value_t upcr_get_shared_val_strict (upcr_shared_ptr_t  src, ptrdiff_t srcoffset, size_t nbytes);
upcr_register_value_t upcr_get_pshared_val(upcr_pshared_ptr_t src, ptrdiff_t srcoffset, size_t nbytes);
upcr_register_value_t upcr_get_pshared_val_strict(upcr_pshared_ptr_t src, ptrdiff_t srcoffset, size_t nbytes);

/* non-blocking value get - useful for NIC's that can target register-like 
     storage such as T3E's eregisters or Quadric's memory-mapped NIC FIFO's
   these operate similarly to the blocking form of value get, but are split-phase
   upcr_get_nb_(p)shared_val initiates a non-blocking value get and 
     returns an explicit handle which MUST be synchronized using upcr_wait_syncnb_valget()
   upcr_wait_syncnb_valget() synchronizes an outstanding get_nb_val operation and 
     returns the retrieved value as described for the blocking version
   Note that upcr_valget_handle_t and upcr_handle_t are completely different datatypes
     and may not be intermixed (i.e. upcr_valget_handle_t's cannot be used with other explicit
     synchronization functions, and upcr_handle_t's cannot be passed to upcr_wait_syncnb_valget()
   There is no try variant of value get synchronization, and no "nbi" variant
   Implementors are recommended to make sizeof(upcr_valget_handle_t) <= sizeof(upcr_register_value_t)
     to facilitate register reuse
*/
typedef ??? upcr_valget_handle_t;

upcr_valget_handle_t upcr_get_nb_shared_val(upcr_shared_ptr_t   src, ptrdiff_t srcoffset, size_t nbytes);
upcr_valget_handle_t upcr_get_nb_shared_val_strict(upcr_shared_ptr_t src, ptrdiff_t srcoffset, size_t nbytes);
upcr_valget_handle_t upcr_get_nb_pshared_val(upcr_pshared_ptr_t src, ptrdiff_t srcoffset, size_t nbytes);
upcr_valget_handle_t upcr_get_nb_pshared_val_strict(upcr_pshared_ptr_t src, ptrdiff_t srcoffset, size_t nbytes);

upcr_register_value_t upcr_wait_syncnb_valget(upcr_valget_handle_t handle);

/* Blocking value puts/gets for floating-point quantities (float, double)
   these operate similarly to the blocking value puts/get for integral types, except are specialized
     for the float and double types on the current platform
   the source/target address is assumed to be correctly aligned for accessing the given FP type
   the primary motivation is to permit puts/gets directly between local shared memory locations
     and the floating point registers, without forcing the use of an integer register 
     or stack temporary as an intermediary (which would be otherwise necessary without these functions)
   there are no non-blocking variants for these functions because they are meant primarily for 
     optimizing low-latency local memory accesses
*/
void   upcr_put_shared_floatval (upcr_shared_ptr_t dest, ptrdiff_t destoffset, float  value);
void   upcr_put_shared_floatval_strict (upcr_shared_ptr_t dest, ptrdiff_t destoffset, float  value);
void   upcr_put_shared_doubleval   (upcr_shared_ptr_t dest, ptrdiff_t destoffset, double value);
void   upcr_put_shared_doubleval_strict (upcr_shared_ptr_t dest, ptrdiff_t destoffset, double value);
float  upcr_get_shared_floatval  (upcr_shared_ptr_t src, ptrdiff_t srcoffset);
float  upcr_get_shared_floatval_strict (upcr_shared_ptr_t src, ptrdiff_t srcoffset);
double upcr_get_shared_doubleval (upcr_shared_ptr_t src, ptrdiff_t srcoffset);
double upcr_get_shared_doubleval_strict (upcr_shared_ptr_t src, ptrdiff_t srcoffset);

void   upcr_put_pshared_floatval (upcr_pshared_ptr_t dest, ptrdiff_t destoffset, float  value);
void   upcr_put_pshared_floatval_strict (upcr_pshared_ptr_t dest, ptrdiff_t destoffset, float  value);
void   upcr_put_pshared_doubleval (upcr_pshared_ptr_t dest, ptrdiff_t destoffset, double value);
void   upcr_put_pshared_doubleval_strict (upcr_pshared_ptr_t dest, ptrdiff_t destoffset, double value);
float  upcr_get_pshared_floatval (upcr_pshared_ptr_t src, ptrdiff_t srcoffset);
float  upcr_get_pshared_floatval_strict (upcr_pshared_ptr_t src, ptrdiff_t srcoffset);
double upcr_get_pshared_doubleval (upcr_pshared_ptr_t src, ptrdiff_t srcoffset);
double upcr_get_pshared_doubleval_strict (upcr_pshared_ptr_t src, ptrdiff_t srcoffset);

/* ------------------------------------------------------------------------------------ */
/*
  Shared Memory Bulk Memory Operations
  ====================================
  Transfer bulk data to/from shared memory which may be remote
  Note these operations all take upcr_shared_ptr_t's (not phaseless ptrs)
  All sizes are specified in BYTES, nbytes >= 0
  Semantics are the same as those specified in the UPC spec
  Implementations will likely optimize for larger values of nbytes
  If the source and target memory areas overlap (but do not exactly coincide), 
    the resulting target memory contents are undefined

  The motivation for having memget and memput, separately from the memory ops above:
    - well defined semantics for crossing block boundaries
    - no alignment constraints on the pointers
    - non-blocking memput constrains source memory from changing while operation is 
      in progress to avoid a potential buffering copy
    - optimize for large sizes

  Implementor's notes: 
    upcr_memset() can be implemented on GASNet using a single small active message, which makes
      it very efficient in terms of network communication
*/

void upcr_memget(void *dst, upcr_shared_ptr_t src, size_t nbytes);
void upcr_memput(upcr_shared_ptr_t dst, const void *src, size_t nbytes);
void upcr_memcpy(upcr_shared_ptr_t dst, upcr_shared_ptr_t src, size_t nbytes);
void upcr_memset(upcr_shared_ptr_t dst, int c, size_t nbytes);

/* non-blocking versions of the bulk memory operations 
   must be synchronized using explicit or implicit synchronization as with 
     non-blocking scalar memory access operations
   The contents of the memory referenced by src must NOT change between 
     initiation and successful synchronization, or the result is undefined
   upcr_nbi_memset is synchronized as if it were an implicit-handle put operation
*/

upcr_handle_t upcr_nb_memget(void *dst, upcr_shared_ptr_t src, size_t nbytes);
upcr_handle_t upcr_nb_memput(upcr_shared_ptr_t dst, const void *src, size_t nbytes);
upcr_handle_t upcr_nb_memcpy(upcr_shared_ptr_t dst, upcr_shared_ptr_t src, size_t nbytes);
upcr_handle_t upcr_nb_memset(upcr_shared_ptr_t dst, int c, size_t nbytes);

void upcr_nbi_memget(void *dst, upcr_shared_ptr_t src, size_t nbytes);
void upcr_nbi_memput(upcr_shared_ptr_t dst, const void *src, size_t nbytes);
void upcr_nbi_memcpy(upcr_shared_ptr_t dst, upcr_shared_ptr_t src, size_t nbytes);
void upcr_nbi_memset(upcr_shared_ptr_t dst, int c, size_t nbytes);


/* ------------------------------------------------------------------------------------ */
/*
  Dynamic Memory Allocation:
  =========================
  UPC runtime interface to generated code for memory allocation
*/

/* Non-collective operation that allocates nbytes in the shared memory area
     with affinity to this thread, and returns a pointer to the new data, 
     which is suitably aligned for any kind of variable. 
   Requires nbytes >= 0
   The memory is not cleared or initialized in any way, although it has been properly 
     registered with the network system in a way appropriate for the current platform such 
     that remote threads can read and write to the memory using upcr shared data transfer operations. 
   If insufficient memory is available, the function will print an implementation-defined 
     error message and terminate the job.

   NOTE: this replaces upcr_local_alloc() which existed in previous versions of this spec.
*/

upcr_shared_ptr_t upcr_alloc(size_t nbytes);

/* Non-collective operation that allocates nblocks * blocksz bytes spread across the shared memory area 
     of 1 or more threads, and returns a pointer to the new data, 
     which is suitably aligned for any kind of variable. 
   Requires nblocks >= 0 and blocksz >= 0

   The memory is blocked across all the threads as if it had been created by the UPC declaration: 

      shared [blocksz] char[nblocks * blocksz] (i.e. both sizes are expressed in bytes).

   Specifically, thread i allocates (at least): 

      Max({0} union {0 < n <= nblocks * blocksz | (floor(n-1/blocksz) % THREADS) == i})  bytes.

   More specifically, thread i allocates (at least) this many bytes:

      blocksz *  ceil(nblocks/THREADS)  if i <= (nblocks % THREADS)
      blocksz * floor(nblocks/THREADS)  if i >  (nblocks % THREADS)

   Implementor's note: Some implementations may allocate the full (blocksz * ceil(nblocks/THREADS)) 
     memory on each thread for simplicity, even though less may be required on some threads.

   Note if nblocks == 1, then all the memory will be allocated in the shared memory space 
     of thread 0 (and implementations should attempt not to waste space on other threads in this 
     common special case).

   In all cases the returned pointer will point to a memory location in the shared memory space 
     of thread 0, and any subsequent chunks in the shared space of other threads will be logically 
     aligned with this pointer (such that incrementing a pointer-to-shared of the appropriate blocksz 
     past the end of a block on one thread will bring it to the start of the next block on the next thread).

   The phase of the returned pointer is set to zero

   The memory is not cleared or initialized in any way, although it has been properly registered 
     with the network system in a way appropriate for the current platform such that remote threads 
     can read and write to the memory using the upcr shared data transfer operations. 
   If insufficient memory is available, the function will print an implementation-defined error 
     message and terminate the job.
*/

upcr_shared_ptr_t upcr_global_alloc(size_t nblocks, size_t blocksz);


/* Collective version of upcr_global_alloc() - the semantics are identical to upcr_global_alloc() 
    with the following exceptions:
     * the function must be called by all threads during the same synchronization phase, 
       and all threads must provide the same arguments
     * may act as a barrier for all threads, but might not in some implementations
     * all threads receive a copy of the result, and the pointer-to-shared values will 
       compare equal (according to upcr_isequal_shared_shared()) on all threads
*/

upcr_shared_ptr_t upcr_all_alloc(size_t nblocks, size_t blocksz);

/* Non-collective operation used to deallocate a shared memory region previously allocated 
     (but not deallocated) using one of: upcr_alloc(), upcr_global_alloc() or upcr_all_alloc().
   If sptr is a null pointer the operation is ignored. 
   The pointer-to-shared value passed to upcr_free() must be the same value returned by the 
     allocation function that created the region (i.e. it must point to the beginning of the object, 
     and for upcr_global_alloc() and upcr_all_alloc() the thread field must indicate thread 0). 
   If sptr has been freed by a previous call to upcr_free() or upcr_all_free(), or does not point
     to the beginning of a live object in shared memory, the behavior is undefined.

   Note that any thread may call upcr_free() to free a given dynamically-allocated shared object, 
     even if that object was created by a call to upcr_alloc() from a different thread.

   Also note that memory allocated using upcr_all_alloc() should only be freed by a call to upcr_free() 
     from a _single_ thread.   See upcr_all_free(), below, for a collective free call.
*/

void upcr_free(upcr_shared_ptr_t sptr) 

/* Collective operation used to deallocate a shared memory region previously allocated
     (but not deallocated) using one of: upcr_alloc(), upcr_global_alloc() or upcr_all_alloc().
   If sptr is a null pointer the operation is ignored.
   The pointer-to-shared value passed to upcr_free() must be the same value returned by the
     allocation function that created the region (i.e. it must point to the beginning of the object,
     and for upcr_global_alloc() and upcr_all_alloc() the thread field must indicate thread 0).
   If sptr has been freed by a previous call to upcr_free() or upcr_all_free(), or does not point to
     the beginning of a live object in shared memory, the behavior is undefined.

   This call must be called collectively by all threads with a single-valued argument.

   The memory remains valid until all threads have entered, but barrier synchronization is
     neither guaranteed nor prohibited.
*/

void upcr_all_free(upcr_shared_ptr_t sptr)

/* ------------------------------------------------------------------------------------ */
/*
  Barrier
  =======
  The runtime provides split-phase barrier support
*/

#define UPCR_BARRIERFLAG_ANONYMOUS ???

/* Execute the notification for a split-phase barrier, with a barrier value
   This is a non-blocking operation that completes immediately after noting the barrier value
   No synchronization is performed on outstanding memory accesses (i.e. the 
     compiler is responsible for inserting the appropriate syncs to implement
     the null strict reference implied by upc_notify before calling upcr_notify())
   Generates a fatal error if this is the second call to upcr_notify() on this thread 
     since the last call to upcr_wait() or the beginning of the program
   flags should be 0 to indicate a normal barrier (which carries the value barrierval)
     or UPCR_BARRIERFLAG_ANONYMOUS to indicate an "anonymous" barrier, where the 
     barrierval argument is ignored and the notify automatically "matches" with any 
     anonymous or non-anonymous value provided by the notify called on other threads
   Implementation notes:
     check value of thread's notify/wait toggle which records current state of synchronization
     save this thread's barrier value and flags
     increment a counter of local threads that called notify this epoch & return
     last thread on this node to call upcr_notify()
       checks the barrier values calls gasnet_notify() with appropriate flags then resets the counter
*/
void upcr_notify(int barrierval, int flags);

/* Execute the wait for a split-phase barrier, with a barrier value
   This is a blocking operation that returns only after all threads have called upcr_notify()
   No synchronization is performed on outstanding memory accesses (i.e. the 
    compiler is responsible for inserting the appropriate syncs to implement
    the null strict reference implied by upc_wait after calling upcr_wait())
   Generates a fatal error if there were no preceding calls to upcr_notify() from this thread,
     or if this is the second call to upcr_wait() since the last call to upcr_notify() on this thread
   Generates a fatal error if flags is not equal to the flags value passed in the preceding 
     upcr_notify() call made by this thread 
   Generates a fatal error if flags==0 and the supplied barrierval doesn't match the value provided 
     in the preceding upcr_notify() call made by this thread 
   Generates a fatal error if any two threads passed non-anonymous barrier values which didn't match
     during the notify calls which began this barrier phase
     
   Implementation notes:
     check and toggle value of thread's notify/wait status which records current state of synchronization
     check that i matches previous value provided by thread in this barrier epoch   
     first thread to enter grabs a lock, spin waits until all threads have called notify (counter reset)
       calls gasnet_wait with appropriate flags, (aborts if there is a mismatch reported)
       and signals that wait is complete by writing a barrier_done flag
     all other threads either block on the lock (if they arrive during gasnet_wait) or merely 
       see that wait is complete and return the mismatch value
     increment a counter of local threads that called wait this epoch & block (sleep or spin-wait)
     last thread on this node to call upcr_wait calls gasnet_wait(i), then releases the other threads 
     when it returns
*/
void upcr_wait(int barrierval, int flags);

/* upcr_try_wait() functions similarly to upcr_wait(), except that it always returns immediately. 
   If the barrier has been notified by all threads, the call behaves as a call to upcr_wait()
     with the same barrierval and flags, and returns the value 1
   If the barrier has not yet been notified by some thread, 
     the call is a no-op and returns  the value 0
   Note this call is not mandated by the UPC spec, but may be useful for performing purely local
    computation in optimized code or performing system housekeeping duties
*/
int upcr_try_wait(int barrierval, int flags);

/* ------------------------------------------------------------------------------------ */
/*
 * Network polling
 * ===============
 *
 * The upcr_poll() function explicitly causes the runtime to attempt to make
 * progress on any network requests that may be pending.  While many other
 * runtime functions implicitly do this as well (i.e. most of those which call
 * the network layer) this function may be useful in cases where a large amount
 * of time has elapsed since the last runtime call (e.g. if a great deal of
 * application-level calculation is taking place).  This function may also be
 * indirectly when a upc_fence is used.
 */
void upcr_poll();

/* ------------------------------------------------------------------------------------ */
/*
  UPC locks
  =========
  The following assumes the updates in the UPC spec 1.1 regarding upc locks,
  namely:
    - upc_lock_t is an opaque shared datatype with incomplete type (prohibits 
      statically-allocated upc_lock_t objects)
    - upc_lock_init() is no longer necessary or useful and is removed
    - upc_lock_free() is added to allow users to free dynamically-allocated locks
    - UPC locks are _not_ recursive (a thread must not attempt to re-acquire a lock it already owns)

  similar to upc_lock_t, the runtime lock datatype is totally opaque and
   always manipulated through upcr_shared_ptr_t pointers, which must NEVER be
   dereferenced by generated code
  this spec intentionally doesn't even provide a name or size for the lock datatype
  the pointer-to-shared returned by the lock allocation routines has reference semantics,
    (i.e. copying the pointer yields a reference to the same lock)
    but otherwise need not even be a real pointer. In other words, the thread affinity 
    and addrfield components of these pointers-to-shared is completely undefined,
    so casting them to a local pointer on _any_ thread may yield a pointer value which 
    doesn't point to a valid memory address (or points to a random object)
    this allows implementations which (for example) store an integer lock identifier 
     in the address field rather than a true pointer
 */

/* non-collective operation (intended to be called by a single thread)
     which dynamically allocates and initializes a lock,
     and returns a upcr_shared_ptr_t which references that lock.
   If insufficient resources are available, the function will print an implementation-defined 
     error message and terminate the job.
 */
upcr_shared_ptr_t upcr_global_lock_alloc();

/* collective operation which dynamically allocates and initializes a lock,
   and returns a upcr_shared_ptr_t which references that lock.
     * the function must be called by all threads during the same synchronization phase, 
     * may act as a barrier for all threads, but might not in some implementations
     * all threads receive a copy of the result, and the pointer-to-shared values will 
       compare equal (according to upcr_isequal_shared_shared()) on all threads
   If insufficient resources are available, the function will print an implementation-defined 
     error message and terminate the job.
 */
upcr_shared_ptr_t upcr_all_lock_alloc();

/* block until the referenced lock can be acquired by this thread
   if no other thread is currently holding or contending for the referenced lock, 
     this operation must return within a bounded amount of time
   implementations should attempt to provide fairness in the presence of 
     contention for this lock, but this property is not required
   if lockptr does not reference a valid lock object (i.e. one previously allocated by
     upcr_global_lock_alloc() or upcr_all_lock_alloc() and not deallocated using
     upcr_lock_free()) then the results are undefined
   if the current thread is already holding the referenced lock, the result is undefined
     (although implementations are recommended to print a useful error message and abort)
 */
void upcr_lock(upcr_shared_ptr_t lockptr);

/* attempt to acquire the referenced lock without blocking
   the operation always returns immediately, with the value 1 if the lock was
     successfully acquired, or with the value 0 if the lock could not be acquired at this time
   if no other thread is currently holding or contending for the referenced lock, 
     repeated calls to this operation will eventually succeed within a bounded amount of time
   if lockptr does not reference a valid lock object then the results are undefined
   if the current thread is already holding the referenced lock, the result is undefined
     (although implementations are encouraged to print a useful error message and abort)
 */
int upcr_lock_attempt(upcr_shared_ptr_t lockptr);

/* unlock the referenced lock 
   this operation releases the referenced lock, which must have been previously locked 
     by this thread using upcr_lock(), or a successful call to upcr_lock_attempt() 
     (otherwise the results are undefined)
   if lockptr does not reference a valid lock object then the results are undefined
   this operation always completes within a bounded amount of time
   implementations are encouraged to detect violations to the locking semantics
     (e.g. unlock with no matching lock) but this is not required
 */
void upcr_unlock(upcr_shared_ptr_t lockptr);

/* free a lock - non-collective operation
   this call (always made from a single thread) releases any system resources 
     associated with the referenced lock and makes the lock object "invalid" for all threads
   the lock need not have been explicitly created by the current thread (i.e. it may have
     been created by a call to upcr_global_lock_alloc() on a separate thread and passed to this one)
   any subsequent calls from any thread using this invalidated lock object have undefined effects 
   if lockptr does not reference a valid lock object then the results are undefined   
   this operation always completes within a bounded amount of time
   repeated calls to upcr_lock_free(upcr_global_lock_alloc()) must succeed indefinitely
     (i.e. it must actually reclaim any associated resources)
   the call will succeed immediately regardless of whether the referenced lock is currently 
     unlocked or currently locked (by any thread)
 */
void upcr_lock_free(upcr_shared_ptr_t lockptr);

/* free a lock - collective operation
   this call (made collectively by all threads) releases any system resources
     associated with the referenced lock and makes the lock object "invalid" for all threads
   the lock need not have been created collectively by a call to upcr_all_lock_alloc() (i.e. it
     may have been created by a call to upcr_global_lock_alloc() on a single thread)
   any subsequent calls from any thread using this invalidated lock object have undefined effects
   if lockptr does not reference a valid lock object then the results are undefined
   this operation always completes within a bounded amount of time
   the call will succeed immediately regardless of whether the referenced lock is currently
     unlocked or currently locked (by any thread)
   this call must be called collectively by all threads with a single-valued argument
   the lock remains valid until all threads have entered, but barrier synchronization is
     neither guaranteed nor prohibited.
 */
void upcr_all_lock_free(upcr_shared_ptr_t lockptr);

/* ------------------------------------------------------------------------------------ */
/* Statically-allocated user variables 
 * ===================================
 * The following interfaces provide portable support for statically-allocated user variables
 * (shared and unshared, scalar and array) 
 */

/*
 * Thread-Local Data (TLD)
 * =======================
 * Thread-local data (TLD) is defined to be any NON-shared, statically-allocated 
 * (i.e. not automatic lifetime) objects declared in UPC source files, namely non-shared 
 * file-scope (global) objects or static local variables (block-scope TLD).
 * TLD must be declared and accessed specially by generated code to ensure correct operation 
 * across the variety of platforms implementing the UPC runtime.
 *
 * The macros below must be used to declare all TLD - global or static user unshared
 * variables (unless they are declared with 'extern', or are located in a
 * regular C file (such as a header file with a name ending in '.h'), since if
 * pthreads are used, these variables will need to be made thread-specific.
 * Static variables need to be transform into global variables before this
 * macro can be used (and their names should be mangled to avoid name
 * collisions).
 *
 * Since uses of these macros are intended to be filterable by tools like
 * grep, they must be used at the start of a new line, and their contents
 * cannot contain line breaks.
 */

/* UPCR_TLD_DEFINE(name, size, align) must be used when declaring
 * unshared global/static variables that the user has initialized.  The
 * macro takes the variable name of the value, and the size and required
 * structure alignment of that type (in bytes, as a
 * single literal number--'sizeof', expressions like '3 + 4', etc., are not
 * allowed).  So the UPC compiler should transform 
 *
 *	int foo = 5;
 *
 * on a platform with 4 byte integers and 4-byte integer structure alignment into
 *
 *	int
 *	UPCR_TLD_DEFINE(foo, 4, 4) = 5;
 *
 * Unshared pointers-to-shared types (i.e. thread local variables
 * with type upcr_shared_ptr_t or upcr_pshared_ptr_t) should be initialized
 * with UPCR_INITIALIZED_{P}SHARED rather than the value the user specified.
 *
 * The full type of the variable must precede the macro, and so arrays and
 * function pointers must use a typedef.  For instance,
 *
 *     int natural_nums[3] = { 1, 2, 3};
 *     void (*int_taker)(int) = &print_int;
 *
 * Would become
 *
 *     typedef int _type_natural_nums[3];
 *     _type_natural_nums
 *     UPCR_TLD_DEFINE(natural_nums, 12, 4) = { 1, 2, 3 };
 *
 *     typedef void (*_type_int_taker)(int);
 *     _type_int_taker
 *     UPCR_TLD_DEFINE(int_taker, 4, 4) = &print_int;
 *
 * For variables that are not explictly initialized by the user,
 * UPCR_TLD_DEFINE_TENTATIVE(name, size, align) must be used.  The macro
 * works the same way as UPCR_TLD_DEFINE, except that it should not be
 * followed by "= initializer_expr."
 *
 * For more information on the uses of these macros, and the treatment of
 * thread-local data generally, see the web page on "static user data" in the
 * Runtime documentation on the Berkeley UPC web site.
 */

#define UPCR_TLD_DEFINE(name, size, align)
#define UPCR_TLD_DEFINE_TENTATIVE(name, size, align)

/* UPCR_TLD_ADDR: retrieve the address of the current thread's representative
   of the TLD variable with the given name (name must be a simple identifier)
   address is returned as a (void *) and should be cast to the proper type before use
 */
#define UPCR_TLD_ADDR(name) ???

/* Example usage: 
      int x = *(int*)UPCR_TLD_ADDR(foo);
      *(int*)UPCR_TLD_ADDR(foo) = 100;
      ((int*)UPCR_TLD_ADDR(natural))[2] = 27;
  Implementors note:
    UPCR_TLD_ADDR() returns an address rather than an l-value because some planned 
    implementations of TLD may not have the TLD type information available
    (TLD will just be opaque bytes in a special data segment)
*/

/*
  Statically-allocated Shared Data (SSD)
  ======================================
  Statically-allocated Shared Data (SSD) is defined to be any shared,
  statically-allocated (i.e. not automatic lifetime) objects declared in UPC
  source files, namely any shared file-scope (global) objects or static local
  variables (block-scope SSD).

  All SSD is allocated and initialized dynamically at runtime, instead of being
  truly statically allocated (since on most platforms network-addressable memory
  can not be assigned at compile time, and must be dynamically allocated).  The
  basic idea is the compiler replaces each SSD declaration with a
  upcr_shared_ptr or upcr_pshared_ptr that will point to the relevant data item
  at runtime (all SSD access operations must be modified appropriately to
  traverse the extra level of indirection).  The compiler also adds an
  allocation and an initialization function for each UPC file it compiles, in
  which all SSD declared in the file is allocated and inititalized (some
  thread-local data initializations may also be performed there).
  
  The functions listed below should only be used in these per-file startup
  allocation/initialization routines.  For more information on the naming
  conventions for these functions, the content that should go in them, and the
  framework that calls them, refer to the "Handling Static Data in the UPC
  Runtime" document (available in the documentation section of the Berkeley UPC
  website at http://upc.lbl.gov).
 */


/*
 * These values are guaranteed to be defined by every pointer-to-shared
 * representation.  UPCR_INITIALIZED_{P}SHARED should be used by the compiler
 * to initialize all upcr_shared_ptr_t and upcr_pshared_ptr_t's that represent
 * shared variables the user defines with an initial value (if the user does
 * not provide a value, do not provide any value for the upcr_{p}shared_ptr,
 * either).  UPCR_NULL_{P}SHARED should be used to initialize
 * upcr_{p}shared_ptr's that represent  unshared pointers-to-shared data that
 * the user explicitly initialized to NULL. 
 * Note these values are only guaranteed to work as variable initializer expressions,
 * and may not safely be used as the rhs for a general assignment statement
 * (upcr_setnull_(p)shared must be used for such applications)
 */
#define UPCR_INITIALIZED_SHARED	    { ??? }
#define UPCR_NULL_SHARED	    { ??? }
#define UPCR_INITIALIZED_PSHARED    { ??? }
#define UPCR_NULL_PSHARED	    { ??? }

/* Pointer-to-shared variables that contain NULL values.
 * Note that these can resolve to either a basic type or a struct (depending
 * on the pointer-to-shared representation), so code that uses them must work in
 * either case (eg. it would be illegal to use them in a context requiring a
 * scalar value, such as passing it to == operator.)
 */
const upcr_shared_ptr_t upcr_null_shared;
const upcr_pshared_ptr_t upcr_null_pshared;

/* 
 * This function will be provided by each pointer-to-shared representation,
 * and returns nonzero if the passed pointer is initialized to
 * UPCR_INITIALIZED_{P}SHARED.
 */
int upcr_is_init_shared(upcr_shared_ptr_t p);
int upcr_is_init_pshared(upcr_pshared_ptr_t p);

/* 
 * Allocation information struct for shared arrays that will be striped across
 * the UPC threads (with blocking size != 1 element): 
 *
 *  sptr_addr	    The address of the proxy upcr_shared_ptr_t for the memory
 *  blockbytes	    Size of each block in bytes
 *  numblocks       Number of blocks to allocate
 *  mult_by_threads Pass nonzero if numblocks should be multiplied by THREADS
 *
 *  Optional elements, used for instrumentation purposes only: (zero values permitted)
 *
 *  elemsz          Value of upc_elemsizeof for this object
 *  namestr         Shared object identifier name
 *  typestr         String encoding of type information for shared object
 *
 */
typedef struct {
   upcr_shared_ptr_t *sptr_addr;
   size_t blockbytes;
   size_t numblocks;
   int	  mult_by_threads;

   size_t elemsz;
   const char *namestr;
   const char *typestr;
} upcr_startup_shalloc_t;

/* 
 * Allocation information struct for indefinitely blocked (or blocksize == 1
 * element) shared arrays.
 *
 *  psptr_addr	The address of the proxy upcr_pshared_ptr_t for the memory
 *  blockbytes	Size of each block in bytes
 *  numblocks	Number of blocks to allocate
 *  mult_by_threads Pass nonzero if numblocks should be multiplied by THREADS
 *
 *  Optional elements, used for instrumentation purposes only: (zero values permitted)
 *
 *  elemsz          Value of upc_elemsizeof for this object
 *  namestr         Shared object identifier name
 *  typestr         String encoding of type information for shared object
 *
 */
typedef struct {
   upcr_pshared_ptr_t *psptr_addr;
   size_t blockbytes;
   size_t numblocks;
   int	  mult_by_threads;

   size_t elemsz;
   const char *namestr;
   const char *typestr;
} upcr_startup_pshalloc_t;

/*
 * Allocates the specified amount of memory for each pointer-to-shared in the
 * array of info structs.
 *
 * Only performs a given allocation if the memory has not already been allocated
 * for the pointer.  If the pointer was not initialized (i.e., is equal to 0
 * instead of UPCR_INITIALIZED_SHARED), any memory allocated is also memset
 * to 0.
 *
 * This function must be called by all threads collectively (like
 * upc_all_alloc, etc.).  The function does not guarantee that all threads
 * will have received the data when any particular thread
 * returns from the call (i.e. it does not guarantee a barrier is performed
 * after initialization).  The function does guarantee that it may be called
 * repeatedly without the need for client barrier calls to be placed in
 * between the calls. 
 *
 * See the upcr_startup_shalloc_t struct definition for options affecting how
 * memory is allocated.
 *
 */
void upcr_startup_shalloc(upcr_startup_shalloc_t *infos, size_t count);

/*
 * Allocates the specified amount of memory for each phaseless pointer-to-shared
 * in the array of info structs.
 *
 * Only performs a given allocation if the memory has not already been allocated
 * for the pointer.  If the pointer was not initialized (i.e., is equal to 0
 * instead of UPCR_INITIALIZED_PSHARED), any memory allocated is also memset
 * to 0.
 *
 * This function must be called by all threads collectively (like
 * upc_all_alloc, etc.).  When the function returns, the pointer-to-shared
 * pointed to by 'infos' will be initialized to the correct shared memory
 * location on all UPC threads.
 *
 * See the upcr_startup_shalloc_t struct definition for options affecting how
 * memory is allocated.
 */
void upcr_startup_pshalloc(upcr_startup_pshalloc_t *infos, size_t count);


/*
 * Information for a single dimension of a shared array initialization.
 *
 *	local_elems	    // Number of elements in local init array's dimension
 *	shared_elems	    // Number of elements in shared array's dimension
 *	mult_by_threads	    // Nonzero if shared array's dimension should be 
 *			         multipled by THREADS
 *
 * Note that the UPC language specification mandates that for a dynamic
 * translation environment (i.e. one in which THREADS is not a compile-time
 * constant) only one dimension of a shared array can contain THREADS, and it
 * can only be used once in that dimension, to multiply a constant size.
 */

typedef struct upcr_startup_arrayinit_diminfo {
    size_t local_elems;
    size_t shared_elems;
    int	   mult_by_threads;
} upcr_startup_arrayinit_diminfo_t;

/*
 * Initializes a shared array from a local array, or to 0s if NULL is passed
 * for the local array.
 *
 * This function is used to copy initial values from a local array (generated
 * by the UPC compiler) that contains any initial values provided by the user.
 * The local array does not need to have the same size as the shared array
 * (indeed, if the shared array contains THREADS in one of its dimensions, its
 * size is not even knowable at compile time).  It does, however, need to have
 * the same number of dimensions as the shared array, and the same element
 * size.  All values in the shared array that do not have corresponding values
 * in the local array are memset to 0.
 *
 * The function takes the addresses of the shared and local arrays, a pointer
 * to an array of structures (each of which describes a single dimension of
 * the array), a count of the number of dimensions in the array, the size (in
 * bytes) of the array's element type, and the blocking factor of the array
 * (as a number of elements).
 *
 * If NULL is passed for the local array address, all local array parameters
 * will be ignored, and the function will simply set all elements of the
 * shared array to 0. 
 *
 * Here is an example:
 *
 *  // in UPC program
 *
 *  shared [5] int j[3][4][2*THREADS] = {
 *      {
 *          { 1, 2 },
 *          { 3, 4 },
 *          { 5, 6 },
 *          { 1, 2, 3, 4, 5 }	// the user may specify extra elems if THREADS 
 *				// is part of the dimension
 *      }
 *  };
 *
 * Here the user has only provided a small subset of the inital values in the
 * array (even disregarding the THREADS in the final dimension).  The UPC
 * compiler should place the initial values into a [1][4][5] array, and then
 * setup and call the initialization function:
 *
 *  // output .c file, at file scope  
 *
 *  upcr_shared_ptr_t j = UPCR_INITIALIZED_SHARED;
 *
 *  int j_initarray[1][4][5] = {
 *      {
 *          { 1, 2 },
 *          { 3, 4 },
 *          { 5, 6 },
 *          { 1, 2, 3, 4, 5 }	
 *      }
 *  };
 *
 *  upcr_startup_arrayinit_diminfo_t j_diminfos[] = {
 *      { 1, 3, 0 },
 *      { 4, 4, 0 },
 *      { 5, 2, 1 }
 *  };
 *
 *  // In initialization function  
 *
 *  upcr_startup_initarray(&j, j_initarray, j_diminfos, 3, sizeof(int), 5);
 *
 * This function must be called collectively by each UPC thread for each array,
 * in the same order and with the same arguments.
 * The function does not guarantee that all threads will have completed their
 * initializations when any particular thread returns from the call (i.e. it
 * does not guarantee a barrier is performed after initialization).
 *
 * Implementation notes:
 * --------------------
 *
 * For efficiency, each thread should only copy elements that belong to its
 * portion of the shared array, so the function should not cause any network
 * traffic.  
 *
 * To save space, the local array's dimensions should only be as large
 * as needed to contain all the initial values specified by the user.  
 */
void upcr_startup_initarray(upcr_shared_ptr_t dst, void * src, 
		            upcr_startup_arrayinit_diminfo_t *diminfos, 
			    size_t dimcnt, size_t elembytes, size_t blockelems);

/*
 * Initializes a phaseless array from a local array, or to 0s if NULL passed.
 *
 * This function is identical to upcr_startup_initarray, except that it takes a
 * phaseless pointer-to-shared.
 *
 * For phaseless shared arrays with indefinite blocksize, pass '0' for the
 * 'blockelems' parameter.
 *
 * Implementor's note:  It should be possible to simply write this as an
 *			inline function that calls upcr_startup_initarray(),
 *			with upcr_pshared_to_shared() used to convert dst to
 *			the correct type.
 */
void upcr_startup_initparray(upcr_pshared_ptr_t dst, void * src, 
			    upcr_startup_arrayinit_diminfo_t *diminfos, 
			    size_t dimcnt, size_t elembytes, size_t blockelems);

/* ------------------------------------------------------------------------------------ */
/* UPC Thread Info
 * ==============-
 * The following provides an implementation of the upc_thread_info() function
 * defined in section 7.7.2.2 of the UPC 1.3 Optional Library Specification.
 *
 * upc_thread_info_t is a structure, defined in the upc_castable_bits.h header,
 * with (at least) the following defined fields:
 *    int guaranteedCastable;
 *    int probablyCastable;
 */

/*
 * Return a upc_thread_info_t structure with the guaranteedCastable and probablyCastable
 * fields initialized as described in the UPC 1.3 Optional Library Specification.
 * Implementation-specific additional fields, if any, are also initialized.
 * The behavior is undefined if (threadId >= upcr_threads()).
 */
upc_thread_info_t upcr_thread_info(size_t threadId);

/* ------------------------------------------------------------------------------------ */
/* Implementation Versioning
 * =========================
 * The following provide configuration and version information for an
 * implementation of this interface.
 */

/* 
 * A string representing all the relevant upcr configuration settings
 * that can be compared using string compare to verify version compatibility.
 * The string is also embedded into the library itself such that it can be
 * scanned for within a binary executable.
 */
#define UPCR_CONFIG_STRING "???" 

/* 
 * Integer values which correspond to the major and minor version numbers
 * for this specification, indicating the highest version level of 
 * runtime spec conformance which is implemented in a given runtime 
 * (available in specification version 3.8 and higher)
 */
#define UPCR_RUNTIME_SPEC_MAJOR x
#define UPCR_RUNTIME_SPEC_MINOR x

/* ------------------------------------------------------------------------------------ */
/* Entry-points in the upc_* Namespace
 * ===================================
 *
 * The following preprocessor definitions are in place during the compilation of translated code.
 * As this interface grows to include any later additions to the UPC specification's Required or
 * Optional libraries, they too will be made available in the upc_* namespace in this manner.
 * This ensures that addition of UPC library interfaces should not require any changes to the
 * source-to-source translator (unless specialized support is mandated).
 */

#define upc_global_exit         upcr_global_exit

#define upc_global_alloc        upcr_global_alloc
#define upc_all_alloc           upcr_all_alloc
#define upc_alloc               upcr_alloc
#define upc_free                upcr_free
#define upc_all_free            upcr_all_free

#define upc_threadof            upcr_threadof_shared
#define upc_phaseof             upcr_phaseof_shared
#define upc_addrfield           upcr_addrfield_shared
#define upc_affinitysize        upcr_affinitysize
#define upc_resetphase          upcr_shared_resetphase

#define upc_global_lock_alloc   upcr_global_lock_alloc
#define upc_all_lock_alloc      upcr_all_lock_alloc
#define upc_lock_free           upcr_lock_free
#define upc_all_lock_free       upcr_all_lock_free
#define upc_lock                upcr_lock
#define upc_lock_attempt        upcr_lock_attempt
#define upc_unlock              upcr_unlock

#define upc_memcpy              upcr_memcpy
#define upc_memput              upcr_memput
#define upc_memget              upcr_memget
#define upc_memset              upcr_memset

#define upc_thread_info         upcr_thread_info

/* ------------------------------------------------------------------------------------ */
