Academic Portfolio
of
Computing
Please consider these are intended to be small samples of my style, training, and work. In most cases, full solutions are intentionally left out to retain the integrity of the courses.
Area | Description |
---|---|
Parallel Computing | Undergrad level matrix multiply in CUDA. |
Parallel Computing | Merge Sort optimized for large tasks on OMP. |
High Performance Computing | 2016 paper review of Atmospheric Science-based PDE/HPC implementation which I really enjoyed. |
Algorithms/Data Structures. | Binary Search Tree in Java. |
Assembler | Undergrad homework: Assembler Parsing |
Matrix Multiply in C++/CUDA
This assignment explored improving performance of vector addition and matrix multiplication by coalescing memory access and optimizing parallelization.
// Loop over all sub matrices in block_row of A and block_col of B
// required to compute Csub. Block multiply each pair of sub matrices
// and accumulate results
for (int m = 0; m < (A.width / FOOTPRINT_SIZE); ++m){
// Get Asub and Bsub descriptors
Asub = &A.elements[A.stride * FOOTPRINT_SIZE * block_row + FOOTPRINT_SIZE * m];
Bsub = &B.elements[B.stride * FOOTPRINT_SIZE * m + FOOTPRINT_SIZE * block_col];
// Copy ELEMENTS OF ASub and Bsub into shared memory
// EACH THREAD loads ONE ELEMENT of ASub and ONE of Bsub
// Notice: it does not need to be the element it requires to
// compute its Cvalue, as long as all elements are
// collaboratively read.
// Notice: every thread declares shared_A and shared_B in shared memory
// even though a thread block has only one shared_A and one shared_B
__shared__ float shared_A[FOOTPRINT_SIZE][FOOTPRINT_SIZE];
__shared__ float shared_B[FOOTPRINT_SIZE][FOOTPRINT_SIZE];
// Each thread copies a column of 4 elements from shared_A and the same from shared_B
for (
shared_A[thread_row*2][thread_col*2] = Asub[2*thread_row * A.stride + 2*thread_col];
shared_B[thread_row*2][thread_col*2] = Bsub[2*thread_row * B.stride + 2*thread_col];
shared_A[thread_row*2+1][thread_col*2] = Asub[(2*thread_row+1) * A.stride + 2*thread_col];
shared_B[thread_row*2+1][thread_col*2] = Bsub[(2*thread_row+1) * B.stride + 2*thread_col];
shared_A[thread_row*2][thread_col*2+1] = Asub[2*thread_row * A.stride + 2*thread_col+1];
shared_B[thread_row*2][thread_col*2+1] = Bsub[2*thread_row * B.stride + 2*thread_col+1];
shared_A[thread_row*2+1][thread_col*2+1] = Asub[(2*thread_row+1) * A.stride + 2*thread_col+1];
shared_B[thread_row*2+1][thread_col*2+1] = Bsub[(2*thread_row+1) * B.stride + 2*thread_col+1];
// Synchronize to ensure all elements are read
__syncthreads();
// Do an inproduct of one row of shared_A and one col of shared_B
// computing one Cvalue by accumulation
#pragma unroll
for(int e=0; e
Merge Sort in C/OMP
#include
#include
#include
#include "timer.h"
mergesort(int a[],int temp[], int low, int high) {
int mid;
if(low1000000)
mergesort(a,temp,low,mid); //sort the first half
#pragma omp task if (high-low>1000000)
mergesort(a,temp,mid+1,high); //sort the second half
#pragma omp taskwait
merge(a,temp,low,high,mid); //merge them togeather into one sorted list
}
}
merge(int a[],int temp[], int low, int high, int mid){
int i, j, k;
i=low; j=mid+1; k=low;
while((i<=mid)&&(j<=high)){
if(a[i] 1 ) LEN = atoi(argv[1]);
int i, *x,*temp;
x=(int *)malloc(sizeof(int)*LEN);
temp=(int *)malloc(sizeof(int)*LEN);
if(x==NULL || temp == NULL){
printf("Out of memory"); exit(0);
}
//Fill the array to be sorted with random numbers
for (i = 0; i < LEN; i++)
x[i] = rand() % LEN;
#ifdef DEBUG
printf("before sort:\n");
for (i = 0; i < LEN; i++) printf("%d ", x[i]);
printf("\n");
#endif
initialize_timer (); //We are just timing the sort
start_timer();
#pragma omp parallel
{
#pragma omp single
mergesort(x,temp,0, (LEN-1));
}
stop_timer();
time=elapsed_time();
#ifdef DEBUG
printf("after sort:\n");
for (i = 0; i < LEN; i++) printf("%d ", x[i]);
printf("\n");
#endif
printf("elapsed time = %lf (sec)\n", time);
return 0;
Binary Search Tree in Java
public Boolean postorderEval(TreeNode node, BST symTable) throws BSTException{
// evaluate left tree
// evaluate right tree (if not null)
// evaluate operator in node and return Boolean result
Boolean leftResult = null;
Boolean rightResult = null;
Boolean nodeResult = null;
if (node.getLeft() != null)
leftResult = postorderEval(node.getLeft(), symTable);
if (node.getRight() != null)
rightResult = postorderEval(node.getRight(), symTable);
switch (node.getItem()) {
case "true": nodeResult = true;
break;
case "false": nodeResult = false;
break;
case "not": nodeResult = !leftResult;
break;
case "and": nodeResult = (leftResult && rightResult);
break;
case "or": nodeResult = (leftResult || rightResult);
break;
default: String key = node.getItem();
nodeResult = symTable.retrieveItem(key).value();
break;
}
return nodeResult;
}
public static void main(String[] args){
System.out.println("Stepwise build infix expression: not not (true or false) and true");
Tree t = new Tree();
//BST symTab = new BST();
System.out.println("tree: "); t.preorderTraverse();
//System.out.println("result:\n" + t.postorderEval(symTab)+"\n");
TreeNode a = new TreeNode("true");
t = new Tree(a);
System.out.println("tree: "); t.preorderTraverse();
//System.out.println("result:\n" + t.postorderEval(symTab)+"\n");
TreeNode b = new TreeNode("false");
TreeNode or = new TreeNode("or", a, b);
t = new Tree(or);
System.out.println("tree: "); t.preorderTraverse();
//System.out.println("result:\n" + t.postorderEval(symTab)+"\n");
TreeNode not2 = new TreeNode("not", or);
t = new Tree(not2);
System.out.println("tree: "); t.preorderTraverse();
//System.out.println("result:\n" + t.postorderEval(symTab)+"\n");
TreeNode not = new TreeNode("not", not2);
TreeNode and = new TreeNode("and", not, new TreeNode("true"));
t = new Tree(and);
System.out.println("tree: "); t.preorderTraverse();
//System.out.println("result:\n" + t.postorderEval(symTab)+"\n");
}
Parsing in Assembler
line_info_t* asm_pass_one (const char* asm_file_name,
const char* sym_file_name) {
int line_no = 0;
char line[MAX_LINE_LENGTH];
const char* tok;
line_info_t* headInfo = NULL;
line_info_t* curInfo = NULL;
line_info_t* newInfo = NULL;
FILE* psym = NULL;
bool orig_found = false; // flag for .ORIG
bool end_found = false; // flag for .END
// open the source file and report any error
FILE *pasm = fopen(asm_file_name,"r");
if (pasm == NULL)
{ asm_error(ERR_OPEN_READ, asm_file_name);
exit(EXIT_FAILURE);
}
// read the lines one at a time using fgets()
while (fgets(line,sizeof(line),pasm) != NULL)
{ //printf("line is: %s",line);
line_no++;
if (sizeof(line) > MAX_LINE_LENGTH)
{ asm_error(ERR_LINE_TOO_LONG);
exit(EXIT_FAILURE);
}
// convert the line to a list of tokens
tok = tokenize_lc3_line(line);
//printf(" First token is: %s\n",tok);
// Skip comment-only lines, blank lines
if (tok)
{ // Some initial error checking for .ORIG and .END
if ((orig_found) && (util_get_opcode(tok) == OP_ORIG))
asm_error(ERR_MULTIPLE_ORIG);
if (!(orig_found))
{ if (util_get_opcode(tok) == OP_ORIG)
orig_found = true;
else
asm_error(ERR_ORIG_NOT_1ST);
}
if ((end_found) && (tok))
asm_error(ERR_END_NOT_LAST);
if ((!(end_found)) && (util_get_opcode(tok) == OP_END))
end_found = true;
// if there is an opcode (e.g. ADD) on the line, then
opcode_t opcode = util_get_opcode(tok);
if (opcode == OP_INVALID)
{ // check to see if it is a label
if (process_label(tok))
{ tok = next_token();
if (tok)
opcode = util_get_opcode(tok);
opcode = util_get_opcode(tok);
}
}
if (tok && opcode == OP_INVALID) // don't make this an else
asm_error(ERR_MISSING_OP,tok);
else if (tok) //found a valid operation, either as the first tok or after a label
{ //printf(" ...and it is a a valid opcode.\n");
// allocate/initialize a new line_info_t structure
newInfo = asm_init_line_info(NULL);
newInfo->opcode = opcode;
newInfo->lineNum = line_no;