macOS OpenMP in Xcode Speedup problems

gustavoQ · Jan 18, 2011

Hi everyone,

i'm tying to do a program that contains parallel and sequential code just to compare the speed up. I already enable the OpenMP in my project and change the compiler version to LLVM GCC 4.2. But the sequential code are getting more speed. I have to do something else for the program work properly?

I have the new Macbook Pro 13'' 2.4 GHz

Thank you.

lee1210 · Jan 18, 2011

Posting some code might be helpful. Are you explicitly setting up the parallelism, or is the library/compiler supposed to be optimizing this for you?

-Lee

gustavoQ · Jan 18, 2011

This is the code.

Code:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <omp.h>
#include "pgm.h"

void edge_detector_parallel(PGMData *img){
	int i,j,Gx,Gy;
	int *aux;
	
	aux=(int *)malloc(img->height*img->width*sizeof(int));
	
	for (i=0;i<(img->height*img->width);i++)
		aux[i]=0;
	//omp_set_num_threads(10);

	omp_set_num_threads(omp_get_num_procs());
	
	
	printf("%d \n", omp_get_num_procs());
#pragma omp parallel for private(i,j,Gx,Gy) shared(aux,img)
	
	for (i=1; i<(img->height-1); i++) {
		for (j=1; j<(img->width-1); j++) {
			
			Gx=-img->image[(i-1)*img->width+(j-1)]-2*img->image[(i-1)*img->width+j]-img->image[(i-1)*img->width+(j+1)]+img->image[(i+1)*img->width+(j-1)]+2*img->image[(i+1)*img->width+j]+img->image[(i+1)*img->width+(j+1)];
			Gy=-img->image[(i-1)*img->width+(j-1)]-2*img->image[i*img->width+(j-1)]-img->image[(i+1)*img->width+(j-1)]+img->image[(i-1)*img->width+(j+1)]+2*img->image[i*img->width+(j+1)]+img->image[(i+1)*img->width+(j+1)];
			
			aux[i*img->width+j]=(int)sqrt((Gx*Gx)+(Gy*Gy));
			
		}
	}
	
	img->image=aux;
}

void edge_detector(PGMData *img){
	int i,j,Gx,Gy;
	int *aux;
	
	aux=(int *)malloc(img->height*img->width*sizeof(int));
	
	for (i=0;i<(img->height*img->width);i++)
		aux[i]=0;
	
	for (i=1; i<(img->height-1); i++) {
		for (j=1; j<(img->width-1); j++) {
			
			Gx=-img->image[(i-1)*img->width+(j-1)]-2*img->image[(i-1)*img->width+j]-img->image[(i-1)*img->width+(j+1)]+img->image[(i+1)*img->width+(j-1)]+2*img->image[(i+1)*img->width+j]+img->image[(i+1)*img->width+(j+1)];
			Gy=-img->image[(i-1)*img->width+(j-1)]-2*img->image[i*img->width+(j-1)]-img->image[(i+1)*img->width+(j-1)]+img->image[(i-1)*img->width+(j+1)]+2*img->image[i*img->width+(j+1)]+img->image[(i+1)*img->width+(j+1)];
			
			aux[i*img->width+j]=(int)sqrt((Gx*Gx)+(Gy*Gy));
			
		}
	}
	
	img->image=aux;
}

int main(){
	double Tp,Ts;
	
	clock_t T1,T2;
	
	PGMData pic,aux;
	
	readPGM("Original.pgm",&pic);
	
	aux=pic;
	
	printf("Height: %d , Width: %d \n",pic.height,pic.width);
	
	T1=clock();
	edge_detector_parallel(&pic);
	T2=clock();
	
	Tp=((double)(T2-T1)/CLOCKS_PER_SEC);
	
	printf("Parallel %f \n",Tp);
	
	T1=clock();
	edge_detector(&aux);
	T2=clock();
	
	Ts=((double)(T2-T1)/CLOCKS_PER_SEC);
	
	printf("Sequential %f \n",Ts);
	
	printf("Speedup %f \n", (Ts/Tp));
	
	writePGM("Final.pgm",&pic);
	
	return 0;
}

pgm.h code.

Code:

#ifndef INC_PGM
#define INC_PGM
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

#define HI(num)	(((num) & 0x0000FF00) >> 8)
#define LO(num)	((num) & 0x000000FF)

//PODATKOVNE STRUKTURE
//Podatkovna struktura, ki hrani podatke o sliki
typedef struct {
	    int height;		//viöina slike
	    int width;		//öirina slike
	    int max_gray;	//maksimalna vrednost sivin
	    int *image;		//polje pikslov na hostu
} PGMData;
//--------------------------------------------------------//
//FUNKCIJE
//Funkcija prebere sliko z imenom filename in jo zapiöe v podatkovno strukturo tipa PGMData
PGMData* readPGM(const char *filename, PGMData *data);

//Funkcija zapiöe sliko, ki se nahaja v podatkovni stukturi tipa PGMData v datoteko z imenom filename
void writePGM(const char *filename, const PGMData *data);
//--------------------------------------------------------//



void SkipComments(FILE *fp)
	{
	    int ch;
	    char line[100];
	 
	    while ((ch = fgetc(fp)) != EOF && isspace(ch))
	        ;
	    if (ch == '#') {
	        fgets(line, sizeof(line), fp);
	        SkipComments(fp);
	    } else
	        fseek(fp, -1, SEEK_CUR);
	}

PGMData* readPGM(const char *filename, PGMData *data)
{
	FILE *pgmFile;
	char version[3];
	int i, j;
	int lo, hi;

	pgmFile = fopen(filename, "rb");
	if (pgmFile == NULL) {
		perror("Cannot open file to read");
		exit(EXIT_FAILURE);
	}

	fgets(version, sizeof(version), pgmFile);
	if (strcmp(version, "P5")) {
		fprintf(stderr, "Wrong file type!\n");
		exit(EXIT_FAILURE);
	}

	SkipComments(pgmFile);
	fscanf(pgmFile, "%d", &data->width);
	SkipComments(pgmFile);
	fscanf(pgmFile, "%d", &data->height);
	SkipComments(pgmFile);
	fscanf(pgmFile, "%d", &data->max_gray);
	fgetc(pgmFile);

	data->image = (int *)malloc(data->height*data->width*sizeof(int));
	if (data->max_gray > 255)
		for (i = 0; i < data->height; ++i)
			for (j = 0; j < data->width; ++j) {
				hi = fgetc(pgmFile);
				lo = fgetc(pgmFile);
				data->image[i*data->width+j] = (hi << 8) + lo;
			}
	else
		for (i = 0; i < data->height; ++i)
			for (j = 0; j < data->width; ++j) {
				lo = fgetc(pgmFile);
				data->image[i*data->width+j] = lo;
			}
	fclose(pgmFile);
	return data;

}

void writePGM(const char *filename, const PGMData *data)
{
	FILE *pgmFile;
	int i, j;
	int hi, lo;
	int width=(data->width)-1;
	pgmFile = fopen(filename, "wb");
	if (pgmFile == NULL) {
		perror("Cannot open file to write");
		exit(EXIT_FAILURE);
	}

	fprintf(pgmFile, "P5 ");
	fprintf(pgmFile, "%d %d ", data->width, data->height);
	fprintf(pgmFile, "%d ", data->max_gray);
	if (data->max_gray > 255) {
		for (i = 0; i < data->height; ++i) {
			for (j = 0; j < data->width; ++j) {
				hi = HI(data->image[i*data->width+j]);
				lo = LO(data->image[i*data->width+j]);
				fputc(hi, pgmFile);
				fputc(lo, pgmFile);
			}

		}
	} else {
		for (i = 0; i < data->height; ++i)
			for (j = 0; j < data->width; ++j) {
				lo = LO(data->image[i*data->width+j]);
				fputc(lo, pgmFile);
			}
	}

	fclose(pgmFile);
	free(data->image);
}
#endif

this program only accept pgm images. The program detect edges.

lee1210 · Jan 18, 2011

I should probably keep my mouth shut here since i'm not familiar with OpenMP, but I am guessing that saying aux and img are "shared" means that there's some sort of coherency checking before they're accessed or at least before they are modified. As such, there are 24 accesses to img for each iteration of your loop, and one modification of aux. If the coherency checking isn't specific to an array position and controls any access anywhere in the arrays, then you're going to be colliding frequently. Between these collisions and the overhead of coherency checking, this is probably where things are going bad.

I guess it would be nice to attempt a way to solve this. I'm not familiar at all with this algorithm and don't really know when i*img->width+j might collide, etc. so I'm not going to be much help. Just for kicks I looked up the pragmas and I think your threads might be fighting and both doing all of the work. I think you may need an omp for pragma to actually dole the work out. Again, i could be mistaken due to inexperience, but the omp parallel doesn't seem to be enough on its own:

When a parallel region is encountered, a logical team of threads is formed. Each thread in the team executes all statements within a parallel region except for work-sharing constructs. Work within work-sharing constructs is distributed among the threads in a team.

Loop iterations must be independent before the loop can be parallelized. An implied barrier exists at the end of a parallelized statement block.

"Each thread ... executes all statements ... except for work-sharing constructs". Maybe a for-loop is a work-sharing construct, but i don't know why there would be a special pragma for a for-loop if so.

Good luck. Hopefully someone with more experience can help you further.

-Lee

gustavoQ · Jan 18, 2011

But the problem is i try this program on a PC that isn't a Macbook, and he actually have a speed up. This is why i don't know if the problem is in the code or the computer.

chown33 · Jan 18, 2011

gustavoQ said:
But the problem is i try this program on a PC that isn't a Macbook, and he actually have a speed up. This is why i don't know if the problem is in the code or the computer.

How many cores does the other computer have?

Concurrency locks for single-core processors are faster than those for multi-core processors.

What OS is the other computer running?
Which version of OpenMP is it using?
Which version is the Mac using?
Post the output produced by the parallel run and the sequential run.

I have no experience with OpenMP. These are just basic questions.

gustavoQ · Jan 18, 2011

chown33 said:
How many cores does the other computer have?
2, is a Intel Pentium T2130
Concurrency locks for single-core processors are faster than those for multi-core processors.

What OS is the other computer running?
Windows XP
Which version of OpenMP is it using?
He are using Virtual Studio Professional 2008, enabling the OpenMP flag.
Which version is the Mac using?
OpenMP 3.0
Post the output produced by the parallel run and the sequential run.

The output in Mac is:

Height: 407 , Width: 600
2
Parallel 0.016386
Sequential 0.013773
Speedup 0.840535

I have no experience with OpenMP. These are just basic questions.

I don't no if the OpenMP is working properly in my Mac.

chown33 · Jan 18, 2011

gustavoQ said:
He are using Virtual Studio Professional 2008, enabling the OpenMP flag.

That's not the same thing as your test program.

Have you actually compiled and run your test program on the Windows XP machine?

gustavoQ · Jan 18, 2011

chown33 said:
That's not the same thing as your test program.

Have you actually compiled and run your test program on the Windows XP machine?

Of course i complied and run in Windows XP. That's why i'm posting, because in different machines i'm getting very different results. In mac it should be some improve on speed up. I have 2 cores too.

chown33 · Jan 18, 2011

gustavoQ said:
Of course i complied and run in Windows XP. That's why i'm posting, because in different machines i'm getting very different results.

Then doesn't it make sense to post both results?

Please post the output from your program when it's run on XP.

gustavoQ · Jan 18, 2011

So the results on Windows XP PC are:

Height: 407 , Width: 600
Parallel 0.015000
Sequential 0.032000
Speedup 2.133333
Press any key to continue . . .

lee1210 · Jan 18, 2011

gustavoQ said:
So the results on Windows XP PC are:

Height: 407 , Width: 600
Parallel 0.015000
Sequential 0.032000
Speedup 2.133333
Press any key to continue . . .

I call foul. More than 2x speedup? With only 2x the cores? Is this the same machine booted in windows and OS X?

-Lee

gustavoQ · Jan 18, 2011

lee1210 said:
I call foul. More than 2x speedup? With only 2x the cores? Is this the same machine booted in windows and OS X?

-Lee

No, is a different machine with the processor that I mentioned.

Probably the code is not in proper manner, but he is doing the edge detecting right.

chown33 · Jan 18, 2011

What other OpenMP code have you tried?

Have you tried any of the examples here:
https://computing.llnl.gov/tutorials/openMP/exercise.html

It's one of many results that appear when googling openMP examples

I thought it was interesting because they also show programs with OpenMP bugs (and fixes). There's even a "When things go wrong..." heading at the bottom of the page.

gustavoQ · Jan 20, 2011

It's not possible that OSX, in some how, is blocking the velocity of the processor cores?

I tryed with another code, this code i have sure is working well, because already pass in evaluation, and is happening the same thing. In Windows XP machine i have speedups and in Macbook i don't have.

I tryed another thing, i install paralles than i use Windows XP and install Visual Studio, the same that the other machine have. And guess what i don't have the speedups.

I don't understand...

gnasher729 · Jan 20, 2011

gustavoQ said:
It's not possible that OSX, in some how, is blocking the velocity of the processor cores?

I tryed with another code, this code i have sure is working well, because already pass in evaluation, and is happening the same thing. In Windows XP machine i have speedups and in Macbook i don't have.

I tryed another thing, i install paralles than i use Windows XP and install Visual Studio, the same that the other machine have. And guess what i don't have the speedups.

I don't understand...

I suggest you open Terminal, enter "man clock", and wait for enlightenment. Read the explanation what clock returns, and read it very, very carefully.

gustavoQ · Jan 20, 2011

This is the code that i'm talking about, is more easy to test that the first one.
The program consist on calculating the pi number.

Code:

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <time.h>

#define iter 10000000

double sequencial();
double parallel();

int main (int argc, char* argv[])
{
	double ts,tp;
	
	ts=sequencial();
	tp=parallel();
	
	printf("SPEEDUP = %f \n",ts/tp);
	return 0;
}

double sequencial()
{
	double x,y,z;
	int i;
	double numb=0,finish;
	int count=0;
	
	printf("Sequencial\n");
	
	clock_t start = clock();
	
	srand(time(NULL));
	
	for(i=0; i<iter; i++)
	{
		x = (double)rand()/RAND_MAX;
		y = (double)rand()/RAND_MAX;
		
		z = (x*x)+(y*y);
		
		if(z<1) count++;
	}
	
	printf("%d \n",count);
	
	numb=(double)(4*count)/iter;
	finish=((double)clock() - start)/CLOCKS_PER_SEC;
	
	printf("pi = %f, time = %f \n",numb,finish);
	
	return finish;
}

double parallel()
{
	double x,y,z;
	int i;
	double numb=0,finish;
	int count=0;
	
	printf("Parallel\n");
	
	clock_t start = clock();
	
	srand(time(NULL));
	
	omp_set_num_threads(omp_get_num_procs());
	
#pragma omp parallel for reduction(+:count) private(x,y,z)
	
	for(i=0; i<iter; i++)
	{
		x = (double)rand()/RAND_MAX;
		y = (double)rand()/RAND_MAX;
		
		z = (x*x)+(y*y);
		
		//#pragma omp critical
		
		if(z<1) count++;
	}
	
	
	
	printf("%d \n",count);
	
	numb=(double)(4*count)/iter;
	finish=((double)clock() - start)/CLOCKS_PER_SEC;
	
	printf("pi = %f, time = %f \n",numb,finish);
	
	return finish;
}

The speedups in windows XP machine are around in 1.8 and 1.9.

gnasher729 · Jan 20, 2011

gustavoQ said:
This is the code that i'm talking about, is more easy to test that the first one.
The program consist on calculating the pi number.

...
The speedups in windows XP machine are around in 1.8 and 1.9.

You didn't read the man page for clock () on MacOS X, did you?

Hint: I think you are making a completely stupid mistake, which is as usual in programming nowhere near where you think it is, and if you read that man page then you will figure it out yourself.

Open Terminal.
Type "man clock".
Read it carefully.

gustavoQ · Jan 20, 2011

gnasher729 said:
You didn't read the man page for clock () on MacOS X, did you?

Hint: I think you are making a completely stupid mistake, which is as usual in programming nowhere near where you think it is, and if you read that man page then you will figure it out yourself.

Open Terminal.
Type "man clock".
Read it carefully.

i already read.

DESCRIPTION
The clock() function determines the amount of processor time used since the invocation of
the calling process, measured in CLOCKS_PER_SECs of a second.

You say to read the manclock () because of its description?

I have a process (program) that run sequential em parallel in same time. What you are trying to say is that I should run separately ?

If isn't that I don't know what I have to understand in the man.

lee1210 · Jan 20, 2011

http://msdn.microsoft.com/en-us/library/4e2ess30(v=vs.71).aspx
vs.
http://www.manpagez.com/man/3/clock/

These things are not the same on the platforms in question. Read them both carefully. They are measuring different things.

-Lee

chown33 · Jan 20, 2011

gustavoQ said:
If isn't that I don't know what I have to understand in the man.

Processor time is not the same as wall-clock time. The clock() function on Mac OS X measures processor time. The clock() function on Windows measures wall-clock time (elapsed real time).

In general, a parallel solution will use the same amount of processor-time as a sequential solution. It's faster only because it's doing multiple parts at once, not because it's using less total processor time. Two processors takes half the time, but the total processor time consumed is (relatively) constant.

As an experiment, use clock() to measure the time it takes for sleep(5) on Mac OS X and XP. It should take very little processor time on Mac OS X, even though the program is clearly sleeping for 5 seconds.

To correctly measure elapsed real time, you'll need to use the time() and difftime() C functions.

FWIW, it looks to me like the Windows function is non-compliant. Everything I've seen, including a reliable reference book from 1992, says that clock() measures processor-time, not wall-clock time.

gnasher729 · Jan 20, 2011

gustavoQ said:
i already read.

You say to read the manclock () because of its description?

I have a process (program) that run sequential em parallel in same time. What you are trying to say is that I should run separately ?

If isn't that I don't know what I have to understand in the man.

Let's say your single threaded code takes ten seconds of CPU time. It finishes after ten seconds. clock () returns 10 seconds.

Your code using two threads takes five seconds of CPU time on two CPUs. It finishes after five seconds. clock () returns 5 x 2 = 10.

Your code using 8 threads on an eight core machine takes 1.25 seconds of CPU time on eight CPUs. It finishes after 1.25 seconds. clock () returns 1.25 x 8 = 10.

It seems that the implementation of clock () on your Windows machine is not Posix conforming. Run your code using a stop watch and see what happens.

On a 12 core Mac Pro, you can use twelve seconds of CPU time every for second of real time. clock () will return 12 seconds for every second that your code runs on twelve cores.

lee1210 said:
http://msdn.microsoft.com/en-us/library/4e2ess30(v=vs.71).aspx
vs.
http://www.manpagez.com/man/3/clock/

These things are not the same on the platforms in question. Read them both carefully. They are measuring different things.

-Lee

Just excellent how the "Return value" and "Remarks" sections in the Microsoft documentation say completely contradictory things.

macOS OpenMP in Xcode Speedup problems

macrumors newbie

macrumors 68040

macrumors newbie

macrumors 68040

macrumors newbie

Moderator

macrumors newbie

Moderator

macrumors newbie

Moderator

macrumors newbie

macrumors 68040

macrumors newbie

Moderator

macrumors newbie

Suspended

macrumors newbie

Suspended

macrumors newbie

macrumors 68040

Moderator

Suspended

Our Staff