Comparing two large binary files in C

up vote
4
down vote

favorite

I'm trying to read two sufficiently large binary files, comparing them and printing the offset at which they differ. I'm using fread to read the binary files and memcmp to compare them.

#include<stdio.h>
#include<time.h>

clock_t start, end;
int main(int argc, char *argv)

 double cpu_time_taken;
 FILE *fp1, *fp2;
 printf("nArgument count: %d", argc);
 printf("nFile 1 is: %s", argv[1]);
 printf("nFile 2 is: %sn", argv[2]);
 if (argc < 3)
 
 printf("nInsufficient Arguments: n");
 printf("nHelp:./executable <filename1> <filename2>n");
 return 0;
 
 else
 
 fp1 = fopen(argv[1], "rb");
 if (fp1 == NULL)
 
 printf("nError in opening file %s", argv[1]);
 return 0;
 

 fp2 = fopen(argv[2], "rb");

 if (fp2 == NULL)
 
 printf("nError in opening file %s", argv[2]);
 return 0;
 

 if ((fp1 != NULL) && (fp2 != NULL))
 
 start = clock();
 compare_two_binary_files(fp1, fp2);
 end = clock();
 cpu_time_taken = ((double) (end - start)) / CLOCKS_PER_SEC;
 printf("nTime taken to compare: %f", cpu_time_taken*1000);
 
 



int compare_two_binary_files(FILE *fp1, FILE *fp2)
 !feof(fp2))
 fread (tmp1, sizeof *tmp1, readsz, fp1);
 fread (tmp2, sizeof *tmp2, readsz, fp2);
 count += 16;
 if(memcmp(tmp1, tmp2, readsz))
 for(int i=0; i < readsz; i++)
 printf ("%d: 0x%02x ",i, tmp1[i]);
 
 printf("n%x", count);
 return 0;

edited 2 hours ago

200_success

125k14144402

asked 2 hours ago

ampat

211

New contributor

add a commentÂ |Â

up vote
4
down vote

favorite

I'm trying to read two sufficiently large binary files, comparing them and printing the offset at which they differ. I'm using fread to read the binary files and memcmp to compare them.

#include<stdio.h>
#include<time.h>

clock_t start, end;
int main(int argc, char *argv)

 double cpu_time_taken;
 FILE *fp1, *fp2;
 printf("nArgument count: %d", argc);
 printf("nFile 1 is: %s", argv[1]);
 printf("nFile 2 is: %sn", argv[2]);
 if (argc < 3)
 
 printf("nInsufficient Arguments: n");
 printf("nHelp:./executable <filename1> <filename2>n");
 return 0;
 
 else
 
 fp1 = fopen(argv[1], "rb");
 if (fp1 == NULL)
 
 printf("nError in opening file %s", argv[1]);
 return 0;
 

 fp2 = fopen(argv[2], "rb");

 if (fp2 == NULL)
 
 printf("nError in opening file %s", argv[2]);
 return 0;
 

 if ((fp1 != NULL) && (fp2 != NULL))
 
 start = clock();
 compare_two_binary_files(fp1, fp2);
 end = clock();
 cpu_time_taken = ((double) (end - start)) / CLOCKS_PER_SEC;
 printf("nTime taken to compare: %f", cpu_time_taken*1000);
 
 



int compare_two_binary_files(FILE *fp1, FILE *fp2)
 !feof(fp2))
 fread (tmp1, sizeof *tmp1, readsz, fp1);
 fread (tmp2, sizeof *tmp2, readsz, fp2);
 count += 16;
 if(memcmp(tmp1, tmp2, readsz))
 for(int i=0; i < readsz; i++)
 printf ("%d: 0x%02x ",i, tmp1[i]);
 
 printf("n%x", count);
 return 0;

edited 2 hours ago

200_success

125k14144402

asked 2 hours ago

ampat

211

New contributor

add a commentÂ |Â

up vote
4
down vote

favorite

I'm trying to read two sufficiently large binary files, comparing them and printing the offset at which they differ. I'm using fread to read the binary files and memcmp to compare them.

#include<stdio.h>
#include<time.h>

clock_t start, end;
int main(int argc, char *argv)

 double cpu_time_taken;
 FILE *fp1, *fp2;
 printf("nArgument count: %d", argc);
 printf("nFile 1 is: %s", argv[1]);
 printf("nFile 2 is: %sn", argv[2]);
 if (argc < 3)
 
 printf("nInsufficient Arguments: n");
 printf("nHelp:./executable <filename1> <filename2>n");
 return 0;
 
 else
 
 fp1 = fopen(argv[1], "rb");
 if (fp1 == NULL)
 
 printf("nError in opening file %s", argv[1]);
 return 0;
 

 fp2 = fopen(argv[2], "rb");

 if (fp2 == NULL)
 
 printf("nError in opening file %s", argv[2]);
 return 0;
 

 if ((fp1 != NULL) && (fp2 != NULL))
 
 start = clock();
 compare_two_binary_files(fp1, fp2);
 end = clock();
 cpu_time_taken = ((double) (end - start)) / CLOCKS_PER_SEC;
 printf("nTime taken to compare: %f", cpu_time_taken*1000);
 
 



int compare_two_binary_files(FILE *fp1, FILE *fp2)
 !feof(fp2))
 fread (tmp1, sizeof *tmp1, readsz, fp1);
 fread (tmp2, sizeof *tmp2, readsz, fp2);
 count += 16;
 if(memcmp(tmp1, tmp2, readsz))
 for(int i=0; i < readsz; i++)
 printf ("%d: 0x%02x ",i, tmp1[i]);
 
 printf("n%x", count);
 return 0;

edited 2 hours ago

200_success

125k14144402

asked 2 hours ago

ampat

211

New contributor

I'm trying to read two sufficiently large binary files, comparing them and printing the offset at which they differ. I'm using fread to read the binary files and memcmp to compare them.

#include<stdio.h>
#include<time.h>

clock_t start, end;
int main(int argc, char *argv)

 double cpu_time_taken;
 FILE *fp1, *fp2;
 printf("nArgument count: %d", argc);
 printf("nFile 1 is: %s", argv[1]);
 printf("nFile 2 is: %sn", argv[2]);
 if (argc < 3)
 
 printf("nInsufficient Arguments: n");
 printf("nHelp:./executable <filename1> <filename2>n");
 return 0;
 
 else
 
 fp1 = fopen(argv[1], "rb");
 if (fp1 == NULL)
 
 printf("nError in opening file %s", argv[1]);
 return 0;
 

 fp2 = fopen(argv[2], "rb");

 if (fp2 == NULL)
 
 printf("nError in opening file %s", argv[2]);
 return 0;
 

 if ((fp1 != NULL) && (fp2 != NULL))
 
 start = clock();
 compare_two_binary_files(fp1, fp2);
 end = clock();
 cpu_time_taken = ((double) (end - start)) / CLOCKS_PER_SEC;
 printf("nTime taken to compare: %f", cpu_time_taken*1000);
 
 



int compare_two_binary_files(FILE *fp1, FILE *fp2)
 !feof(fp2))
 fread (tmp1, sizeof *tmp1, readsz, fp1);
 fread (tmp2, sizeof *tmp2, readsz, fp2);
 count += 16;
 if(memcmp(tmp1, tmp2, readsz))
 for(int i=0; i < readsz; i++)
 printf ("%d: 0x%02x ",i, tmp1[i]);
 
 printf("n%x", count);
 return 0;

performance c file

edited 2 hours ago

200_success

125k14144402

asked 2 hours ago

ampat

211

New contributor

edited 2 hours ago

200_success

125k14144402

asked 2 hours ago

ampat

211

New contributor

edited 2 hours ago

200_success

125k14144402

edited 2 hours ago

200_success

125k14144402

edited 2 hours ago

200_success

125k14144402

asked 2 hours ago

ampat

211

New contributor

asked 2 hours ago

ampat

211

asked 2 hours ago

ampat

211

New contributor

ampat is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.

add a commentÂ |Â

4 Answers
4

active

oldest

votes

up vote
2
down vote

fread (tmp1, sizeof *tmp1, readsz, fp1);
fread (tmp2, sizeof *tmp2, readsz, fp2);
count += 16;
if(memcmp(tmp1, tmp2, readsz))
 Ã¢Â€Â¦

You are discarding the return values of the fread() calls, blindly assuming that they both successfully read 16 bytes.

It is unclear what the return value of compare_two_binary_files(Ã¢Â€Â¦, Ã¢Â€Â¦) means. In fact, you sometimes don't return a value at all. Your compiler should have warned you about that problem.

File I/O should be done a block at a time: 512 bytes, 1024 bytes, or 2048 bytes would be a more reasonable chunk size than 16.

Technically, most of the time is spent waiting for I/O. cpu_time_taken is a misnomer: what you're measuring is called "wall clock time".

answered 1 hour ago

200_success

125k14144402

add a commentÂ |Â

up vote
1
down vote

if ((fp1 != NULL) && (fp2 != NULL)) test is redundant. If one of them happened to be NULL, the program is already terminated.

Don't print error messages to the stdout. There is stderr for that purpose.

When printing to stdout, keep in mind that it is usually line buffered. The text stays in the internal buffer until a newline is printed. That's why it is important to print a newline after the message, not before.

printf("nError in opening file %s", argv[1]); doesn't tell the most important part: why did fopen fail. Print strerror(errno) as well. Combining the above bullets,
```
 fprintf(stderr, "Error in opening file %s: %sn", argv[1], strerror(errno));
```
You'd need to #include <errno.h> for that.

answered 1 hour ago

vnp

37k12992

"why it is important to print a newline after the message" is not necessarily sufficient. What are the rules of automatic flushing stdout buffer in C?. Consider fflush(stdout);.
â€“Â chux
1 hour ago

I like the idea of of informative error messages, yet printing a faulty file name is sometimes itself a problem. At a minimum, it is useful to include sententials to demarcate the iffy file name. file %s --> file "%s".
â€“Â chux
32 secs ago

add a commentÂ |Â

up vote
1
down vote

I'm trying to read two sufficiently large binary files, comparing them and printing the offset at which they differ.

compare_two_binary_files() is faulty for various reasons.

The offset in which they differ can readily exceed the int range. Recommend the widest integer available instead. File sizes are not limited by the integer types available, yet uintmax_t is a very reasonable assumption of being sufficient for a file size.

Checking feof() only ignores the possibility of a rare input error messing up the compare.

compare_two_binary_files() can return without a returning a value. A compiler warning should have occurred. Enable all warnings or use a better compiler.

Ignoring the return value of fread() is wrong.

if(memcmp(tmp1, tmp2, readsz)){ is questionable. The if() is true when the buffer differ.

Suggested alternative:

// Use something more generous than 16. Maybe even 4096 or 64k and allocate buffers
#define CMP_N 256

// Return value:
// 0: files compare equal in content and length, fp1 size saved as offset
// 1: files differ, fp1 longer, fp2 size saved as offset
// 2: files differ, fp2 longer, fp1 size saved as offset
// 3: files differ at offset
// -1: fp1 trouble reading. Unspecified data in offset
// -2: fp2 trouble reading. Unspecified data in offset
int compare_two_binary_files_alternate(FILE *fp1, FILE *fp2, uintmax_t *offset) 
 char tmp1[CMP_N], tmp2[CMP_N];
 size_t n1, n2;

 rewind(fp1); // start at beginning and clear error
 rewind(fp2);
 *offset = 0;
 do 
 n1 = fread(tmp1, sizeof *tmp1, sizeof tmp1 / sizeof *tmp1, fp1);
 if (n1 == 0 && ferror(fp1)) 
 return -1;
 
 n2 = fread(tmp2, sizeof *tmp2, sizeof tmp2 / sizeof *tmp2, fp2);
 if (n2 == 0 && ferror(fp2)) 
 return -2;
 
 size_t n_min = n1 < n2 ? n1 : n2;
 if (memcmp(tmp1, tmp2, n_min)) // Quickly find if file contents differ ...
 for (size_t i = 0; i < n_min; i++) // Slowly find where they differ
 if (tmp1[i] != tmp2[i]) 
 *offset += i;
 return 3;
 
 
 
 *offset += n_min;
 if (n1 > n_min) 
 return 1;
 
 if (n2 > n_min) 
 return 2;
 
 while (n1);
 return 0;

edited 12 mins ago

answered 27 mins ago

chux

11.7k11339

add a commentÂ |Â

up vote
0
down vote

In addition to what 200_success said:

"rb" is not a valid fopen flag. It most likely has the same effect as "r"

the return value of memcmp() is zero if the two are equal, so i am not sure what the print loop is doing, especially because it exits if the first 16 bytes are the same.

in general try not to print in calculation functions, instead use the return value to indicate a result.

16 is an arbitrary and very small block size to read. consider making it larger and storing it as a constant, rather than retyping it each time.

answered 1 hour ago

Peter

65812

2

Disagree with ""rb" is not a valid fopen flag." The C spec has "rb open binary file for reading".
â€“Â chux
1 hour ago

add a commentÂ |Â

Your Answer

StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);

);

ampat is a new contributor. Be nice, and check out our Code of Conduct.

draft saved

draft discarded

StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f204478%2fcomparing-two-large-binary-files-in-c%23new-answer', 'question_page');

);

Post as a guest

Name

4 Answers
4

active

oldest

votes

4 Answers
4

active

oldest

votes

up vote
2
down vote

fread (tmp1, sizeof *tmp1, readsz, fp1);
fread (tmp2, sizeof *tmp2, readsz, fp2);
count += 16;
if(memcmp(tmp1, tmp2, readsz))
 Ã¢Â€Â¦

You are discarding the return values of the fread() calls, blindly assuming that they both successfully read 16 bytes.

File I/O should be done a block at a time: 512 bytes, 1024 bytes, or 2048 bytes would be a more reasonable chunk size than 16.

Technically, most of the time is spent waiting for I/O. cpu_time_taken is a misnomer: what you're measuring is called "wall clock time".

answered 1 hour ago

200_success

125k14144402

add a commentÂ |Â

up vote
2
down vote

fread (tmp1, sizeof *tmp1, readsz, fp1);
fread (tmp2, sizeof *tmp2, readsz, fp2);
count += 16;
if(memcmp(tmp1, tmp2, readsz))
 Ã¢Â€Â¦

You are discarding the return values of the fread() calls, blindly assuming that they both successfully read 16 bytes.

File I/O should be done a block at a time: 512 bytes, 1024 bytes, or 2048 bytes would be a more reasonable chunk size than 16.

Technically, most of the time is spent waiting for I/O. cpu_time_taken is a misnomer: what you're measuring is called "wall clock time".

answered 1 hour ago

200_success

125k14144402

add a commentÂ |Â

up vote
2
down vote

fread (tmp1, sizeof *tmp1, readsz, fp1);
fread (tmp2, sizeof *tmp2, readsz, fp2);
count += 16;
if(memcmp(tmp1, tmp2, readsz))
 Ã¢Â€Â¦

You are discarding the return values of the fread() calls, blindly assuming that they both successfully read 16 bytes.

File I/O should be done a block at a time: 512 bytes, 1024 bytes, or 2048 bytes would be a more reasonable chunk size than 16.

Technically, most of the time is spent waiting for I/O. cpu_time_taken is a misnomer: what you're measuring is called "wall clock time".

answered 1 hour ago

200_success

125k14144402

fread (tmp1, sizeof *tmp1, readsz, fp1);
fread (tmp2, sizeof *tmp2, readsz, fp2);
count += 16;
if(memcmp(tmp1, tmp2, readsz))
 Ã¢Â€Â¦

You are discarding the return values of the fread() calls, blindly assuming that they both successfully read 16 bytes.

File I/O should be done a block at a time: 512 bytes, 1024 bytes, or 2048 bytes would be a more reasonable chunk size than 16.

Technically, most of the time is spent waiting for I/O. cpu_time_taken is a misnomer: what you're measuring is called "wall clock time".

answered 1 hour ago

200_success

125k14144402

answered 1 hour ago

200_success

125k14144402

answered 1 hour ago

200_success

125k14144402

answered 1 hour ago

200_success

125k14144402

add a commentÂ |Â

up vote
1
down vote

if ((fp1 != NULL) && (fp2 != NULL)) test is redundant. If one of them happened to be NULL, the program is already terminated.

Don't print error messages to the stdout. There is stderr for that purpose.

When printing to stdout, keep in mind that it is usually line buffered. The text stays in the internal buffer until a newline is printed. That's why it is important to print a newline after the message, not before.

printf("nError in opening file %s", argv[1]); doesn't tell the most important part: why did fopen fail. Print strerror(errno) as well. Combining the above bullets,
```
 fprintf(stderr, "Error in opening file %s: %sn", argv[1], strerror(errno));
```
You'd need to #include <errno.h> for that.

answered 1 hour ago

vnp

37k12992

"why it is important to print a newline after the message" is not necessarily sufficient. What are the rules of automatic flushing stdout buffer in C?. Consider fflush(stdout);.
â€“Â chux
1 hour ago

I like the idea of of informative error messages, yet printing a faulty file name is sometimes itself a problem. At a minimum, it is useful to include sententials to demarcate the iffy file name. file %s --> file "%s".
â€“Â chux
32 secs ago

add a commentÂ |Â

up vote
1
down vote

if ((fp1 != NULL) && (fp2 != NULL)) test is redundant. If one of them happened to be NULL, the program is already terminated.

Don't print error messages to the stdout. There is stderr for that purpose.

When printing to stdout, keep in mind that it is usually line buffered. The text stays in the internal buffer until a newline is printed. That's why it is important to print a newline after the message, not before.

printf("nError in opening file %s", argv[1]); doesn't tell the most important part: why did fopen fail. Print strerror(errno) as well. Combining the above bullets,
```
 fprintf(stderr, "Error in opening file %s: %sn", argv[1], strerror(errno));
```
You'd need to #include <errno.h> for that.

answered 1 hour ago

vnp

37k12992

"why it is important to print a newline after the message" is not necessarily sufficient. What are the rules of automatic flushing stdout buffer in C?. Consider fflush(stdout);.
â€“Â chux
1 hour ago

I like the idea of of informative error messages, yet printing a faulty file name is sometimes itself a problem. At a minimum, it is useful to include sententials to demarcate the iffy file name. file %s --> file "%s".
â€“Â chux
32 secs ago

add a commentÂ |Â

up vote
1
down vote

if ((fp1 != NULL) && (fp2 != NULL)) test is redundant. If one of them happened to be NULL, the program is already terminated.

Don't print error messages to the stdout. There is stderr for that purpose.

When printing to stdout, keep in mind that it is usually line buffered. The text stays in the internal buffer until a newline is printed. That's why it is important to print a newline after the message, not before.

printf("nError in opening file %s", argv[1]); doesn't tell the most important part: why did fopen fail. Print strerror(errno) as well. Combining the above bullets,
```
 fprintf(stderr, "Error in opening file %s: %sn", argv[1], strerror(errno));
```
You'd need to #include <errno.h> for that.

answered 1 hour ago

vnp

37k12992

if ((fp1 != NULL) && (fp2 != NULL)) test is redundant. If one of them happened to be NULL, the program is already terminated.

Don't print error messages to the stdout. There is stderr for that purpose.

When printing to stdout, keep in mind that it is usually line buffered. The text stays in the internal buffer until a newline is printed. That's why it is important to print a newline after the message, not before.

printf("nError in opening file %s", argv[1]); doesn't tell the most important part: why did fopen fail. Print strerror(errno) as well. Combining the above bullets,
```
 fprintf(stderr, "Error in opening file %s: %sn", argv[1], strerror(errno));
```
You'd need to #include <errno.h> for that.

answered 1 hour ago

vnp

37k12992

answered 1 hour ago

vnp

37k12992

answered 1 hour ago

vnp

37k12992

answered 1 hour ago

vnp

37k12992

"why it is important to print a newline after the message" is not necessarily sufficient. What are the rules of automatic flushing stdout buffer in C?. Consider fflush(stdout);.
â€“Â chux
1 hour ago

I like the idea of of informative error messages, yet printing a faulty file name is sometimes itself a problem. At a minimum, it is useful to include sententials to demarcate the iffy file name. file %s --> file "%s".
â€“Â chux
32 secs ago

add a commentÂ |Â

"why it is important to print a newline after the message" is not necessarily sufficient. What are the rules of automatic flushing stdout buffer in C?. Consider fflush(stdout);.
â€“Â chux
1 hour ago

I like the idea of of informative error messages, yet printing a faulty file name is sometimes itself a problem. At a minimum, it is useful to include sententials to demarcate the iffy file name. file %s --> file "%s".
â€“Â chux
32 secs ago

"why it is important to print a newline after the message" is not necessarily sufficient. What are the rules of automatic flushing stdout buffer in C?. Consider fflush(stdout);.
â€“Â chux
1 hour ago

I like the idea of of informative error messages, yet printing a faulty file name is sometimes itself a problem. At a minimum, it is useful to include sententials to demarcate the iffy file name. file %s --> file "%s".
â€“Â chux
32 secs ago

add a commentÂ |Â

up vote
1
down vote

I'm trying to read two sufficiently large binary files, comparing them and printing the offset at which they differ.

compare_two_binary_files() is faulty for various reasons.

The offset in which they differ can readily exceed the int range. Recommend the widest integer available instead. File sizes are not limited by the integer types available, yet uintmax_t is a very reasonable assumption of being sufficient for a file size.

Checking feof() only ignores the possibility of a rare input error messing up the compare.

compare_two_binary_files() can return without a returning a value. A compiler warning should have occurred. Enable all warnings or use a better compiler.

Ignoring the return value of fread() is wrong.

if(memcmp(tmp1, tmp2, readsz)){ is questionable. The if() is true when the buffer differ.

Suggested alternative:

// Use something more generous than 16. Maybe even 4096 or 64k and allocate buffers
#define CMP_N 256

// Return value:
// 0: files compare equal in content and length, fp1 size saved as offset
// 1: files differ, fp1 longer, fp2 size saved as offset
// 2: files differ, fp2 longer, fp1 size saved as offset
// 3: files differ at offset
// -1: fp1 trouble reading. Unspecified data in offset
// -2: fp2 trouble reading. Unspecified data in offset
int compare_two_binary_files_alternate(FILE *fp1, FILE *fp2, uintmax_t *offset) 
 char tmp1[CMP_N], tmp2[CMP_N];
 size_t n1, n2;

 rewind(fp1); // start at beginning and clear error
 rewind(fp2);
 *offset = 0;
 do 
 n1 = fread(tmp1, sizeof *tmp1, sizeof tmp1 / sizeof *tmp1, fp1);
 if (n1 == 0 && ferror(fp1)) 
 return -1;
 
 n2 = fread(tmp2, sizeof *tmp2, sizeof tmp2 / sizeof *tmp2, fp2);
 if (n2 == 0 && ferror(fp2)) 
 return -2;
 
 size_t n_min = n1 < n2 ? n1 : n2;
 if (memcmp(tmp1, tmp2, n_min)) // Quickly find if file contents differ ...
 for (size_t i = 0; i < n_min; i++) // Slowly find where they differ
 if (tmp1[i] != tmp2[i]) 
 *offset += i;
 return 3;
 
 
 
 *offset += n_min;
 if (n1 > n_min) 
 return 1;
 
 if (n2 > n_min) 
 return 2;
 
 while (n1);
 return 0;

edited 12 mins ago

answered 27 mins ago

chux

11.7k11339

add a commentÂ |Â

up vote
1
down vote

I'm trying to read two sufficiently large binary files, comparing them and printing the offset at which they differ.

compare_two_binary_files() is faulty for various reasons.

The offset in which they differ can readily exceed the int range. Recommend the widest integer available instead. File sizes are not limited by the integer types available, yet uintmax_t is a very reasonable assumption of being sufficient for a file size.

Checking feof() only ignores the possibility of a rare input error messing up the compare.

compare_two_binary_files() can return without a returning a value. A compiler warning should have occurred. Enable all warnings or use a better compiler.

Ignoring the return value of fread() is wrong.

if(memcmp(tmp1, tmp2, readsz)){ is questionable. The if() is true when the buffer differ.

Suggested alternative:

// Use something more generous than 16. Maybe even 4096 or 64k and allocate buffers
#define CMP_N 256

// Return value:
// 0: files compare equal in content and length, fp1 size saved as offset
// 1: files differ, fp1 longer, fp2 size saved as offset
// 2: files differ, fp2 longer, fp1 size saved as offset
// 3: files differ at offset
// -1: fp1 trouble reading. Unspecified data in offset
// -2: fp2 trouble reading. Unspecified data in offset
int compare_two_binary_files_alternate(FILE *fp1, FILE *fp2, uintmax_t *offset) 
 char tmp1[CMP_N], tmp2[CMP_N];
 size_t n1, n2;

 rewind(fp1); // start at beginning and clear error
 rewind(fp2);
 *offset = 0;
 do 
 n1 = fread(tmp1, sizeof *tmp1, sizeof tmp1 / sizeof *tmp1, fp1);
 if (n1 == 0 && ferror(fp1)) 
 return -1;
 
 n2 = fread(tmp2, sizeof *tmp2, sizeof tmp2 / sizeof *tmp2, fp2);
 if (n2 == 0 && ferror(fp2)) 
 return -2;
 
 size_t n_min = n1 < n2 ? n1 : n2;
 if (memcmp(tmp1, tmp2, n_min)) // Quickly find if file contents differ ...
 for (size_t i = 0; i < n_min; i++) // Slowly find where they differ
 if (tmp1[i] != tmp2[i]) 
 *offset += i;
 return 3;
 
 
 
 *offset += n_min;
 if (n1 > n_min) 
 return 1;
 
 if (n2 > n_min) 
 return 2;
 
 while (n1);
 return 0;

edited 12 mins ago

answered 27 mins ago

chux

11.7k11339

add a commentÂ |Â

up vote
1
down vote

I'm trying to read two sufficiently large binary files, comparing them and printing the offset at which they differ.

compare_two_binary_files() is faulty for various reasons.

The offset in which they differ can readily exceed the int range. Recommend the widest integer available instead. File sizes are not limited by the integer types available, yet uintmax_t is a very reasonable assumption of being sufficient for a file size.

Checking feof() only ignores the possibility of a rare input error messing up the compare.

compare_two_binary_files() can return without a returning a value. A compiler warning should have occurred. Enable all warnings or use a better compiler.

Ignoring the return value of fread() is wrong.

if(memcmp(tmp1, tmp2, readsz)){ is questionable. The if() is true when the buffer differ.

Suggested alternative:

// Use something more generous than 16. Maybe even 4096 or 64k and allocate buffers
#define CMP_N 256

// Return value:
// 0: files compare equal in content and length, fp1 size saved as offset
// 1: files differ, fp1 longer, fp2 size saved as offset
// 2: files differ, fp2 longer, fp1 size saved as offset
// 3: files differ at offset
// -1: fp1 trouble reading. Unspecified data in offset
// -2: fp2 trouble reading. Unspecified data in offset
int compare_two_binary_files_alternate(FILE *fp1, FILE *fp2, uintmax_t *offset) 
 char tmp1[CMP_N], tmp2[CMP_N];
 size_t n1, n2;

 rewind(fp1); // start at beginning and clear error
 rewind(fp2);
 *offset = 0;
 do 
 n1 = fread(tmp1, sizeof *tmp1, sizeof tmp1 / sizeof *tmp1, fp1);
 if (n1 == 0 && ferror(fp1)) 
 return -1;
 
 n2 = fread(tmp2, sizeof *tmp2, sizeof tmp2 / sizeof *tmp2, fp2);
 if (n2 == 0 && ferror(fp2)) 
 return -2;
 
 size_t n_min = n1 < n2 ? n1 : n2;
 if (memcmp(tmp1, tmp2, n_min)) // Quickly find if file contents differ ...
 for (size_t i = 0; i < n_min; i++) // Slowly find where they differ
 if (tmp1[i] != tmp2[i]) 
 *offset += i;
 return 3;
 
 
 
 *offset += n_min;
 if (n1 > n_min) 
 return 1;
 
 if (n2 > n_min) 
 return 2;
 
 while (n1);
 return 0;

edited 12 mins ago

answered 27 mins ago

chux

11.7k11339

I'm trying to read two sufficiently large binary files, comparing them and printing the offset at which they differ.

compare_two_binary_files() is faulty for various reasons.

The offset in which they differ can readily exceed the int range. Recommend the widest integer available instead. File sizes are not limited by the integer types available, yet uintmax_t is a very reasonable assumption of being sufficient for a file size.

Checking feof() only ignores the possibility of a rare input error messing up the compare.

compare_two_binary_files() can return without a returning a value. A compiler warning should have occurred. Enable all warnings or use a better compiler.

Ignoring the return value of fread() is wrong.

if(memcmp(tmp1, tmp2, readsz)){ is questionable. The if() is true when the buffer differ.

Suggested alternative:

// Use something more generous than 16. Maybe even 4096 or 64k and allocate buffers
#define CMP_N 256

// Return value:
// 0: files compare equal in content and length, fp1 size saved as offset
// 1: files differ, fp1 longer, fp2 size saved as offset
// 2: files differ, fp2 longer, fp1 size saved as offset
// 3: files differ at offset
// -1: fp1 trouble reading. Unspecified data in offset
// -2: fp2 trouble reading. Unspecified data in offset
int compare_two_binary_files_alternate(FILE *fp1, FILE *fp2, uintmax_t *offset) 
 char tmp1[CMP_N], tmp2[CMP_N];
 size_t n1, n2;

 rewind(fp1); // start at beginning and clear error
 rewind(fp2);
 *offset = 0;
 do 
 n1 = fread(tmp1, sizeof *tmp1, sizeof tmp1 / sizeof *tmp1, fp1);
 if (n1 == 0 && ferror(fp1)) 
 return -1;
 
 n2 = fread(tmp2, sizeof *tmp2, sizeof tmp2 / sizeof *tmp2, fp2);
 if (n2 == 0 && ferror(fp2)) 
 return -2;
 
 size_t n_min = n1 < n2 ? n1 : n2;
 if (memcmp(tmp1, tmp2, n_min)) // Quickly find if file contents differ ...
 for (size_t i = 0; i < n_min; i++) // Slowly find where they differ
 if (tmp1[i] != tmp2[i]) 
 *offset += i;
 return 3;
 
 
 
 *offset += n_min;
 if (n1 > n_min) 
 return 1;
 
 if (n2 > n_min) 
 return 2;
 
 while (n1);
 return 0;

edited 12 mins ago

answered 27 mins ago

chux

11.7k11339

edited 12 mins ago

answered 27 mins ago

chux

11.7k11339

answered 27 mins ago

chux

11.7k11339

answered 27 mins ago

chux

11.7k11339

add a commentÂ |Â

up vote
0
down vote

In addition to what 200_success said:

"rb" is not a valid fopen flag. It most likely has the same effect as "r"

the return value of memcmp() is zero if the two are equal, so i am not sure what the print loop is doing, especially because it exits if the first 16 bytes are the same.

in general try not to print in calculation functions, instead use the return value to indicate a result.

16 is an arbitrary and very small block size to read. consider making it larger and storing it as a constant, rather than retyping it each time.

answered 1 hour ago

Peter

65812

2

Disagree with ""rb" is not a valid fopen flag." The C spec has "rb open binary file for reading".
â€“Â chux
1 hour ago

add a commentÂ |Â

up vote
0
down vote

In addition to what 200_success said:

"rb" is not a valid fopen flag. It most likely has the same effect as "r"

the return value of memcmp() is zero if the two are equal, so i am not sure what the print loop is doing, especially because it exits if the first 16 bytes are the same.

in general try not to print in calculation functions, instead use the return value to indicate a result.

16 is an arbitrary and very small block size to read. consider making it larger and storing it as a constant, rather than retyping it each time.

answered 1 hour ago

Peter

65812

2

Disagree with ""rb" is not a valid fopen flag." The C spec has "rb open binary file for reading".
â€“Â chux
1 hour ago

add a commentÂ |Â

up vote
0
down vote

In addition to what 200_success said:

"rb" is not a valid fopen flag. It most likely has the same effect as "r"

the return value of memcmp() is zero if the two are equal, so i am not sure what the print loop is doing, especially because it exits if the first 16 bytes are the same.

in general try not to print in calculation functions, instead use the return value to indicate a result.

16 is an arbitrary and very small block size to read. consider making it larger and storing it as a constant, rather than retyping it each time.

answered 1 hour ago

Peter

65812

In addition to what 200_success said:

"rb" is not a valid fopen flag. It most likely has the same effect as "r"

the return value of memcmp() is zero if the two are equal, so i am not sure what the print loop is doing, especially because it exits if the first 16 bytes are the same.

in general try not to print in calculation functions, instead use the return value to indicate a result.

16 is an arbitrary and very small block size to read. consider making it larger and storing it as a constant, rather than retyping it each time.

answered 1 hour ago

Peter

65812

answered 1 hour ago

Peter

65812

answered 1 hour ago

Peter

65812

answered 1 hour ago

Peter

65812

2

Disagree with ""rb" is not a valid fopen flag." The C spec has "rb open binary file for reading".
â€“Â chux
1 hour ago

add a commentÂ |Â

2

Disagree with ""rb" is not a valid fopen flag." The C spec has "rb open binary file for reading".
â€“Â chux
1 hour ago

Disagree with ""rb" is not a valid fopen flag." The C spec has "rb open binary file for reading".
â€“Â chux
1 hour ago

add a commentÂ |Â

ampat is a new contributor. Be nice, and check out our Code of Conduct.

draft saved

draft discarded

ampat is a new contributor. Be nice, and check out our Code of Conduct.

draft saved

draft discarded

Post as a guest

Name

Search This Blog

Iyfjky