OpenMP in Xcode Speedup problems

Discussion in 'Mac Programming' started by gustavoQ, Jan 18, 2011.

  1. gustavoQ macrumors newbie

    Joined:
    Apr 15, 2010
    #1
    Hi everyone,

    i'm tying to do a program that contains parallel and sequential code just to compare the speed up. I already enable the OpenMP in my project and change the compiler version to LLVM GCC 4.2. But the sequential code are getting more speed. I have to do something else for the program work properly?

    I have the new Macbook Pro 13'' 2.4 GHz

    Thank you.
     
  2. lee1210 macrumors 68040

    lee1210

    Joined:
    Jan 10, 2005
    Location:
    Dallas, TX
    #2
    Posting some code might be helpful. Are you explicitly setting up the parallelism, or is the library/compiler supposed to be optimizing this for you?

    -Lee
     
  3. gustavoQ thread starter macrumors newbie

    Joined:
    Apr 15, 2010
    #3
    This is the code.

    Code:
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <math.h>
    #include <omp.h>
    #include "pgm.h"
    
    void edge_detector_parallel(PGMData *img){
    	int i,j,Gx,Gy;
    	int *aux;
    	
    	aux=(int *)malloc(img->height*img->width*sizeof(int));
    	
    	for (i=0;i<(img->height*img->width);i++)
    		aux[i]=0;
    	//omp_set_num_threads(10);
    
    	omp_set_num_threads(omp_get_num_procs());
    	
    	
    	printf("%d \n", omp_get_num_procs());
    #pragma omp parallel for private(i,j,Gx,Gy) shared(aux,img)
    	
    	for (i=1; i<(img->height-1); i++) {
    		for (j=1; j<(img->width-1); j++) {
    			
    			Gx=-img->image[(i-1)*img->width+(j-1)]-2*img->image[(i-1)*img->width+j]-img->image[(i-1)*img->width+(j+1)]+img->image[(i+1)*img->width+(j-1)]+2*img->image[(i+1)*img->width+j]+img->image[(i+1)*img->width+(j+1)];
    			Gy=-img->image[(i-1)*img->width+(j-1)]-2*img->image[i*img->width+(j-1)]-img->image[(i+1)*img->width+(j-1)]+img->image[(i-1)*img->width+(j+1)]+2*img->image[i*img->width+(j+1)]+img->image[(i+1)*img->width+(j+1)];
    			
    			aux[i*img->width+j]=(int)sqrt((Gx*Gx)+(Gy*Gy));
    			
    		}
    	}
    	
    	img->image=aux;
    }
    
    void edge_detector(PGMData *img){
    	int i,j,Gx,Gy;
    	int *aux;
    	
    	aux=(int *)malloc(img->height*img->width*sizeof(int));
    	
    	for (i=0;i<(img->height*img->width);i++)
    		aux[i]=0;
    	
    	for (i=1; i<(img->height-1); i++) {
    		for (j=1; j<(img->width-1); j++) {
    			
    			Gx=-img->image[(i-1)*img->width+(j-1)]-2*img->image[(i-1)*img->width+j]-img->image[(i-1)*img->width+(j+1)]+img->image[(i+1)*img->width+(j-1)]+2*img->image[(i+1)*img->width+j]+img->image[(i+1)*img->width+(j+1)];
    			Gy=-img->image[(i-1)*img->width+(j-1)]-2*img->image[i*img->width+(j-1)]-img->image[(i+1)*img->width+(j-1)]+img->image[(i-1)*img->width+(j+1)]+2*img->image[i*img->width+(j+1)]+img->image[(i+1)*img->width+(j+1)];
    			
    			aux[i*img->width+j]=(int)sqrt((Gx*Gx)+(Gy*Gy));
    			
    		}
    	}
    	
    	img->image=aux;
    }
    
    int main(){
    	double Tp,Ts;
    	
    	clock_t T1,T2;
    	
    	PGMData pic,aux;
    	
    	readPGM("Original.pgm",&pic);
    	
    	aux=pic;
    	
    	printf("Height: %d , Width: %d \n",pic.height,pic.width);
    	
    	T1=clock();
    	edge_detector_parallel(&pic);
    	T2=clock();
    	
    	Tp=((double)(T2-T1)/CLOCKS_PER_SEC);
    	
    	printf("Parallel %f \n",Tp);
    	
    	T1=clock();
    	edge_detector(&aux);
    	T2=clock();
    	
    	Ts=((double)(T2-T1)/CLOCKS_PER_SEC);
    	
    	printf("Sequential %f \n",Ts);
    	
    	printf("Speedup %f \n", (Ts/Tp));
    	
    	writePGM("Final.pgm",&pic);
    	
    	return 0;
    }
    
    pgm.h code.

    Code:
    #ifndef INC_PGM
    #define INC_PGM
    #include <stdio.h>
    #include <stdlib.h>
    #include <ctype.h>
    #include <string.h>
    
    #define HI(num)	(((num) & 0x0000FF00) >> 8)
    #define LO(num)	((num) & 0x000000FF)
    
    //PODATKOVNE STRUKTURE
    //Podatkovna struktura, ki hrani podatke o sliki
    typedef struct {
    	    int height;		//viöina slike
    	    int width;		//öirina slike
    	    int max_gray;	//maksimalna vrednost sivin
    	    int *image;		//polje pikslov na hostu
    } PGMData;
    //--------------------------------------------------------//
    //FUNKCIJE
    //Funkcija prebere sliko z imenom filename in jo zapiöe v podatkovno strukturo tipa PGMData
    PGMData* readPGM(const char *filename, PGMData *data);
    
    //Funkcija zapiöe sliko, ki se nahaja v podatkovni stukturi tipa PGMData v datoteko z imenom filename
    void writePGM(const char *filename, const PGMData *data);
    //--------------------------------------------------------//
    
    
    
    void SkipComments(FILE *fp)
    	{
    	    int ch;
    	    char line[100];
    	 
    	    while ((ch = fgetc(fp)) != EOF && isspace(ch))
    	        ;
    	    if (ch == '#') {
    	        fgets(line, sizeof(line), fp);
    	        SkipComments(fp);
    	    } else
    	        fseek(fp, -1, SEEK_CUR);
    	}
    
    PGMData* readPGM(const char *filename, PGMData *data)
    {
    	FILE *pgmFile;
    	char version[3];
    	int i, j;
    	int lo, hi;
    
    	pgmFile = fopen(filename, "rb");
    	if (pgmFile == NULL) {
    		perror("Cannot open file to read");
    		exit(EXIT_FAILURE);
    	}
    
    	fgets(version, sizeof(version), pgmFile);
    	if (strcmp(version, "P5")) {
    		fprintf(stderr, "Wrong file type!\n");
    		exit(EXIT_FAILURE);
    	}
    
    	SkipComments(pgmFile);
    	fscanf(pgmFile, "%d", &data->width);
    	SkipComments(pgmFile);
    	fscanf(pgmFile, "%d", &data->height);
    	SkipComments(pgmFile);
    	fscanf(pgmFile, "%d", &data->max_gray);
    	fgetc(pgmFile);
    
    	data->image = (int *)malloc(data->height*data->width*sizeof(int));
    	if (data->max_gray > 255)
    		for (i = 0; i < data->height; ++i)
    			for (j = 0; j < data->width; ++j) {
    				hi = fgetc(pgmFile);
    				lo = fgetc(pgmFile);
    				data->image[i*data->width+j] = (hi << 8) + lo;
    			}
    	else
    		for (i = 0; i < data->height; ++i)
    			for (j = 0; j < data->width; ++j) {
    				lo = fgetc(pgmFile);
    				data->image[i*data->width+j] = lo;
    			}
    	fclose(pgmFile);
    	return data;
    
    }
    
    void writePGM(const char *filename, const PGMData *data)
    {
    	FILE *pgmFile;
    	int i, j;
    	int hi, lo;
    	int width=(data->width)-1;
    	pgmFile = fopen(filename, "wb");
    	if (pgmFile == NULL) {
    		perror("Cannot open file to write");
    		exit(EXIT_FAILURE);
    	}
    
    	fprintf(pgmFile, "P5 ");
    	fprintf(pgmFile, "%d %d ", data->width, data->height);
    	fprintf(pgmFile, "%d ", data->max_gray);
    	if (data->max_gray > 255) {
    		for (i = 0; i < data->height; ++i) {
    			for (j = 0; j < data->width; ++j) {
    				hi = HI(data->image[i*data->width+j]);
    				lo = LO(data->image[i*data->width+j]);
    				fputc(hi, pgmFile);
    				fputc(lo, pgmFile);
    			}
    
    		}
    	} else {
    		for (i = 0; i < data->height; ++i)
    			for (j = 0; j < data->width; ++j) {
    				lo = LO(data->image[i*data->width+j]);
    				fputc(lo, pgmFile);
    			}
    	}
    
    	fclose(pgmFile);
    	free(data->image);
    }
    #endif
    this program only accept pgm images. The program detect edges.
     
  4. lee1210 macrumors 68040

    lee1210

    Joined:
    Jan 10, 2005
    Location:
    Dallas, TX
    #4
    I should probably keep my mouth shut here since i'm not familiar with OpenMP, but I am guessing that saying aux and img are "shared" means that there's some sort of coherency checking before they're accessed or at least before they are modified. As such, there are 24 accesses to img for each iteration of your loop, and one modification of aux. If the coherency checking isn't specific to an array position and controls any access anywhere in the arrays, then you're going to be colliding frequently. Between these collisions and the overhead of coherency checking, this is probably where things are going bad.

    I guess it would be nice to attempt a way to solve this. I'm not familiar at all with this algorithm and don't really know when i*img->width+j might collide, etc. so I'm not going to be much help. Just for kicks I looked up the pragmas and I think your threads might be fighting and both doing all of the work. I think you may need an omp for pragma to actually dole the work out. Again, i could be mistaken due to inexperience, but the omp parallel doesn't seem to be enough on its own:
    "Each thread ... executes all statements ... except for work-sharing constructs". Maybe a for-loop is a work-sharing construct, but i don't know why there would be a special pragma for a for-loop if so.

    Good luck. Hopefully someone with more experience can help you further.

    -Lee
     
  5. gustavoQ thread starter macrumors newbie

    Joined:
    Apr 15, 2010
    #5
    But the problem is i try this program on a PC that isn't a Macbook, and he actually have a speed up. This is why i don't know if the problem is in the code or the computer.
     
  6. chown33 macrumors 604

    Joined:
    Aug 9, 2009
    Location:
    Sailing beyond the sunset
    #6
    How many cores does the other computer have?

    Concurrency locks for single-core processors are faster than those for multi-core processors.

    What OS is the other computer running?
    Which version of OpenMP is it using?
    Which version is the Mac using?
    Post the output produced by the parallel run and the sequential run.

    I have no experience with OpenMP. These are just basic questions.
     
  7. gustavoQ thread starter macrumors newbie

    Joined:
    Apr 15, 2010
    #7
    I don't no if the OpenMP is working properly in my Mac.
     
  8. chown33 macrumors 604

    Joined:
    Aug 9, 2009
    Location:
    Sailing beyond the sunset
    #8
    That's not the same thing as your test program.

    Have you actually compiled and run your test program on the Windows XP machine?
     
  9. gustavoQ thread starter macrumors newbie

    Joined:
    Apr 15, 2010
    #9
    Of course i complied and run in Windows XP. That's why i'm posting, because in different machines i'm getting very different results. In mac it should be some improve on speed up. I have 2 cores too.
     
  10. chown33 macrumors 604

    Joined:
    Aug 9, 2009
    Location:
    Sailing beyond the sunset
    #10
    Then doesn't it make sense to post both results?

    Please post the output from your program when it's run on XP.
     
  11. gustavoQ thread starter macrumors newbie

    Joined:
    Apr 15, 2010
    #11
    So the results on Windows XP PC are:

    Height: 407 , Width: 600
    Parallel 0.015000
    Sequential 0.032000
    Speedup 2.133333
    Press any key to continue . . .
     
  12. lee1210 macrumors 68040

    lee1210

    Joined:
    Jan 10, 2005
    Location:
    Dallas, TX
    #12
    I call foul. More than 2x speedup? With only 2x the cores? Is this the same machine booted in windows and OS X?

    -Lee
     
  13. gustavoQ thread starter macrumors newbie

    Joined:
    Apr 15, 2010
    #13
    No, is a different machine with the processor that I mentioned.

    Probably the code is not in proper manner, but he is doing the edge detecting right.
     
  14. chown33 macrumors 604

    Joined:
    Aug 9, 2009
    Location:
    Sailing beyond the sunset
    #14
    What other OpenMP code have you tried?

    Have you tried any of the examples here:
    https://computing.llnl.gov/tutorials/openMP/exercise.html

    It's one of many results that appear when googling openMP examples

    I thought it was interesting because they also show programs with OpenMP bugs (and fixes). There's even a "When things go wrong..." heading at the bottom of the page.
     
  15. gustavoQ thread starter macrumors newbie

    Joined:
    Apr 15, 2010
    #15
    It's not possible that OSX, in some how, is blocking the velocity of the processor cores?

    I tryed with another code, this code i have sure is working well, because already pass in evaluation, and is happening the same thing. In Windows XP machine i have speedups and in Macbook i don't have.

    I tryed another thing, i install paralles than i use Windows XP and install Visual Studio, the same that the other machine have. And guess what i don't have the speedups.

    I don't understand...
     
  16. gnasher729 macrumors P6

    gnasher729

    Joined:
    Nov 25, 2005
    #16
    I suggest you open Terminal, enter "man clock", and wait for enlightenment. Read the explanation what clock returns, and read it very, very carefully.
     
  17. gustavoQ thread starter macrumors newbie

    Joined:
    Apr 15, 2010
    #17
    This is the code that i'm talking about, is more easy to test that the first one.
    The program consist on calculating the pi number.

    Code:
    #include <stdlib.h>
    #include <stdio.h>
    #include <math.h>
    #include <omp.h>
    #include <time.h>
    
    #define iter 10000000
    
    double sequencial();
    double parallel();
    
    int main (int argc, char* argv[])
    {
    	double ts,tp;
    	
    	ts=sequencial();
    	tp=parallel();
    	
    	printf("SPEEDUP = %f \n",ts/tp);
    	return 0;
    }
    
    double sequencial()
    {
    	double x,y,z;
    	int i;
    	double numb=0,finish;
    	int count=0;
    	
    	printf("Sequencial\n");
    	
    	clock_t start = clock();
    	
    	srand(time(NULL));
    	
    	for(i=0; i<iter; i++)
    	{
    		x = (double)rand()/RAND_MAX;
    		y = (double)rand()/RAND_MAX;
    		
    		z = (x*x)+(y*y);
    		
    		if(z<1) count++;
    	}
    	
    	printf("%d \n",count);
    	
    	numb=(double)(4*count)/iter;
    	finish=((double)clock() - start)/CLOCKS_PER_SEC;
    	
    	printf("pi = %f, time = %f \n",numb,finish);
    	
    	return finish;
    }
    
    double parallel()
    {
    	double x,y,z;
    	int i;
    	double numb=0,finish;
    	int count=0;
    	
    	printf("Parallel\n");
    	
    	clock_t start = clock();
    	
    	srand(time(NULL));
    	
    	omp_set_num_threads(omp_get_num_procs());
    	
    #pragma omp parallel for reduction(+:count) private(x,y,z)
    	
    	for(i=0; i<iter; i++)
    	{
    		x = (double)rand()/RAND_MAX;
    		y = (double)rand()/RAND_MAX;
    		
    		z = (x*x)+(y*y);
    		
    		//#pragma omp critical
    		
    		if(z<1) count++;
    	}
    	
    	
    	
    	printf("%d \n",count);
    	
    	numb=(double)(4*count)/iter;
    	finish=((double)clock() - start)/CLOCKS_PER_SEC;
    	
    	printf("pi = %f, time = %f \n",numb,finish);
    	
    	return finish;
    }
    The speedups in windows XP machine are around in 1.8 and 1.9.
     
  18. gnasher729 macrumors P6

    gnasher729

    Joined:
    Nov 25, 2005
    #18
    You didn't read the man page for clock () on MacOS X, did you?

    Hint: I think you are making a completely stupid mistake, which is as usual in programming nowhere near where you think it is, and if you read that man page then you will figure it out yourself.

    Open Terminal.
    Type "man clock".
    Read it carefully.
     
  19. gustavoQ thread starter macrumors newbie

    Joined:
    Apr 15, 2010
    #19
    i already read.

    You say to read the manclock () because of its description?

    I have a process (program) that run sequential em parallel in same time. What you are trying to say is that I should run separately ?

    If isn't that I don't know what I have to understand in the man.
     
  20. lee1210 macrumors 68040

    lee1210

    Joined:
    Jan 10, 2005
    Location:
    Dallas, TX
    #20
  21. chown33 macrumors 604

    Joined:
    Aug 9, 2009
    Location:
    Sailing beyond the sunset
    #21
    Processor time is not the same as wall-clock time. The clock() function on Mac OS X measures processor time. The clock() function on Windows measures wall-clock time (elapsed real time).

    In general, a parallel solution will use the same amount of processor-time as a sequential solution. It's faster only because it's doing multiple parts at once, not because it's using less total processor time. Two processors takes half the time, but the total processor time consumed is (relatively) constant.


    As an experiment, use clock() to measure the time it takes for sleep(5) on Mac OS X and XP. It should take very little processor time on Mac OS X, even though the program is clearly sleeping for 5 seconds.

    To correctly measure elapsed real time, you'll need to use the time() and difftime() C functions.



    FWIW, it looks to me like the Windows function is non-compliant. Everything I've seen, including a reliable reference book from 1992, says that clock() measures processor-time, not wall-clock time.
     
  22. gnasher729, Jan 20, 2011
    Last edited: Jan 20, 2011

    gnasher729 macrumors P6

    gnasher729

    Joined:
    Nov 25, 2005
    #22
    Let's say your single threaded code takes ten seconds of CPU time. It finishes after ten seconds. clock () returns 10 seconds.

    Your code using two threads takes five seconds of CPU time on two CPUs. It finishes after five seconds. clock () returns 5 x 2 = 10.

    Your code using 8 threads on an eight core machine takes 1.25 seconds of CPU time on eight CPUs. It finishes after 1.25 seconds. clock () returns 1.25 x 8 = 10.

    It seems that the implementation of clock () on your Windows machine is not Posix conforming. Run your code using a stop watch and see what happens.

    On a 12 core Mac Pro, you can use twelve seconds of CPU time every for second of real time. clock () will return 12 seconds for every second that your code runs on twelve cores.

    Just excellent how the "Return value" and "Remarks" sections in the Microsoft documentation say completely contradictory things.
     

Share This Page