回 帖 发 新 帖 刷新版面

主题:[讨论]C++大型文件处理

有3个文件A,B,C
A,B为2个基础文件,读取C文件的一行(1)去A,B,文件中查找相同的字符串,从文件A.B中提取找到那行的后面的信息,在C文件(1)后面输出提取出的那行.
如:A
aaa  123
bbb  456
ccc  789
文件B
qqq  565
ggg  321

文件C
aaa
bbb
ccc
qqq
ggg

结果文件
aaa  123
bbb  456
ccc  789
qqq  565
ggg  321
A文件有200M,B文件有400M,C文件有8M,我写的程序运行起来非常慢!也没想出什么好的算法,请高手指点

我写的程序:
#include<iostream>
#include<fstream>
#include<string>
#include<vector>
#include<iomanip>
using namespace std;

const long int SIZE=5000000;
const long int SIZE1=200000;
vector<string> Line;
char ListA[SIZE][40],HA[SIZE][40],ListB[SIZE][40],HB[SIZE][40],CompList[SIZE1][40];

int NA1,NA2,LineNumA,NB1,NB2,LineNumB,N1,LineNum;

int main()
{
    string line;
    ofstream outfile("finish");
    ifstream infile("A.out");

    while(getline(infile,line))
    {
        int tebnum=0,nA1=0,nA2=0;
        char listA[40]="",hA[40]="";
        for(int i=0; i<line.size(); i++)
        {
            if(line[i]=='\t')
            {
                tebnum++;
                continue;
            }

            if(tebnum==0)
            {
                char temp=line[i];
                listA[nA1++]=temp;
            }

            if(tebnum==1)
            {
                char temp=line[i];
                hA[nA2++]=temp;
            }
            if(tebnum>=2)
                break;
        }
        
        strcpy(*(ListA+NA1),listA);
        strcpy(*(HA+NA2),hA);
        
        NA1++;NA2++;LineNumA++;
    }

    infile.close();
    cout<<*ListA<<endl;
    cout<<*HA<<endl;
    cout<<"f"<<endl;
    ifstream infile1("B.out");

    while(getline(infile1,line))
    {
        int tebnum=0,nB1=0,nB2=0;
        char listB[40]="",hB[40]="";
        for(int i=0; i<line.size(); i++)
        {
            if(line[i]=='\t')
            {
                tebnum++;
                continue;
            }

            if(tebnum==0)
            {
                char temp=line[i];
                listB[nB1++]=temp;
            }

            if(tebnum==1)
            {
                char temp=line[i];
                hB[nB2++]=temp;
            }
            if(tebnum>=2)
                break;
        }
        strcpy(*(ListB+NB1),listB);
        strcpy(*(HB+NB2),hB);
        NB1++;NB2++;LineNumB++;
    }
    infile1.close();
    cout<<*ListB<<endl;
    cout<<*HB<<endl;
    cout<<"f"<<endl;
    ifstream infile2("CuA_CuB.m");

    while(getline(infile2,line))
    {
        Line.push_back(line);
        int tebnum=0,n=1;
        char list[40]="";
        list[0]='S';
        for(int i=0; i<line.size(); i++)
        {
            
            if(line[i]=='\t')
            {
                tebnum++;
                continue;
            }
            if(tebnum==5)
            {
                char temp=line[i];
                list[n++]=temp;
            }
        }
        strcpy(*(CompList+N1),list);
        
        N1++;LineNum++;
    }
    
    for(int cnt=0; cnt<LineNum; cnt+=2)
    {
        char compA[40]="",compB[40]="";
        //把A文件分成4个部分,同时查找与C文件相同的行
        for(int cnt1=0,cnt4=LineNumA/4,cnt6=LineNumA/2,cnt7=3*LineNumA/4; 
        cnt1<LineNumA/4,cnt4<LineNumA/2,cnt6<3*LineNumA/4,cnt7<LineNumA; cnt1++,cnt4++,cnt6++,cnt7++)
        {
            if (strcmp(*(CompList+cnt),*(ListA+cnt1))==0)
            {
                strcpy(compA,*(HA+cnt1));
                break;
            }
            if(strcmp(*(CompList+cnt),*(ListA+cnt4))==0)
            {
                strcpy(compA,*(HA+cnt4));
                break;
            }
            if(strcmp(*(CompList+cnt),*(ListA+cnt6))==0)
            {
                strcpy(compA,*(HA+cnt6));
                break;
            }
            if(strcmp(*(CompList+cnt),*(ListA+cnt7))==0)
            {
                strcpy(compA,*(HA+cnt7));
                break;
            }
        }
        //把B文件分成4个部分,同时查找与C文件相同的行
        for(int cnt2=0,cnt5=LineNumB/4,cnt8=LineNumB/2,cnt9=3*LineNumB/4;
        cnt2<LineNumB/4,cnt5<LineNumB/2,cnt8<3*LineNumB/4,cnt9<LineNumB; cnt2++,cnt5++,cnt8++,cnt9++)
        {
            if (strcmp(*(CompList+cnt),*(ListB+cnt2))==0)
            {
                strcpy(compB,*(HB+cnt2));
                break;
            }
            if(strcmp(*(CompList+cnt),*(ListB+cnt5))==0)
            {
                strcpy(compB,*(HB+cnt5));
                break;
            }
            if(strcmp(*(CompList+cnt),*(ListB+cnt8))==0)
            {
                strcpy(compB,*(HB+cnt8));
                break;
            }
            if(strcmp(*(CompList+cnt),*(ListB+cnt9))==0)
            {
                strcpy(compB,*(HB+cnt9));
                break;
            }
        }
        //下面为输出部分
        int numberA[40],numberB[40];
        cout<<compA<<"  "<<compB<<endl;
        outfile<<Line[cnt]<<'\t';
        for(int cnt3=0; cnt3<35;cnt3++)
        {
            numberA[cnt3]=100*((double)compA[cnt3]/'h');
            outfile<<setw(4)<<numberA[cnt3];
        }
        
        outfile<<endl;

        outfile<<Line[cnt+1]<<'\t';
        for(cnt3=0; cnt3<35; cnt3++)
        {
            numberB[cnt3]=100*((double)compB[cnt3]/'h');
            outfile<<setw(4)<<numberB[cnt3];
        }
        outfile<<endl;
    }
    return 0;

回复列表 (共1个回复)

沙发


如果用多线程查找A,B文件中匹配的字符串,该如何写?

我来回复

您尚未登录,请登录后再回复。点此登录或注册