[C++] tesseract 5.3.1 GetUTF8Text() crash(비정상 종료 버그)

카테고리 없음
[C++] tesseract 5.3.1 GetUTF8Text() crash(비정상 종료 버그)

뿌셩 2023. 6. 15. 00:15
약 10만 개의 이미지 파일을 대상으로 텍스트 변환을 하는 도중 GetUTF8Text() 함수에서 프로그램이
죽는 문제가 발생하였다. 처음에는 8개의 스레드로 텍스트 변환 작업을 하다 보니 랜덤한 이미지에서
발생하는 버그인 줄 알았다. 그러나 프로그램 crash가 동작 시작 후 약 19시간 만에 발생한다는
특징을 발견하였고, 결국 특정 이미지에서 발생한다는 것을 알게 되었다.

아쉽게도 해당 이미지는 내용상 공개가 불가하다.
다만 tesseract의 crash를 피하는 방법을 적어 두고자 한다(근본적인 해결 방법은 아니다).

주석으로 modified라고 표시된 부분이 수정해야 할 곳이다. important라고 써넣은 곳은 실제로 그 부분에서
오류가 발생한 것이 확인된 곳이다.
1. elist.cpp 파일을 아래와 같이 수정
 
//elist.cpp
/**
 * Moves the iterator one step and returns the element it now points at.
 *
 * Returns nullptr when the iterator has no list or the list is empty.
 * When NDEBUG is not defined, the NO_LIST / NULL_DATA / NULL_NEXT checks
 * are still raised with the ABORT action before the guards below are reached.
 */
ELIST_LINK* ELIST_ITERATOR::forward() {
#ifndef NDEBUG
    if (list == nullptr) {
        NO_LIST.error("ELIST_ITERATOR::forward", ABORT);
    }
#endif
    // modified: without a list there is nothing to walk; an empty list is
    // handled the same way.
    if (list == nullptr || list->empty()) {
        return nullptr;
    }

    if (current == nullptr) {
        // The current element was removed: continue from the saved successor.
        if (ex_current_was_cycle_pt) {
            cycle_pt = next;
        }
        current = next;
    }
    else {
        // The current element is still present, so it becomes the predecessor.
        prev = current;
        started_cycling = true;
        // In case next is deleted by another iterator, get next from current.
        current = current->next;
    }

#ifndef NDEBUG
    if (current == nullptr) {
        NULL_DATA.error("ELIST_ITERATOR::forward", ABORT);
    }
#endif
    // modified(important): follow the link only when there is a current
    // element; otherwise `next` keeps its previous value.
    if (current != nullptr) {
        next = current->next;
    }
#ifndef NDEBUG
    if (next == nullptr) {
        NULL_NEXT.error("ELIST_ITERATOR::forward", ABORT,
            "This is: %p  Current is: %p",
            static_cast<void*>(this),
            static_cast<void*>(current));
    }
#endif
    return current;
}
 
2. pageres.cpp 파일을 아래와 같이 수정
 
//pageres.cpp
/**
 * Replaces the current word with the given set of words, moving the blobs of
 * the current word into the replacements.
 *
 * modified: the return type was originally void and is now bool; the
 * declaration in the header file must be changed to bool as well.
 *
 * @param words Replacement words. On the normal path every entry is handed
 *              over to the row's word_res_list and the vector is cleared.
 * @return true when the replacement completed (or `words` was empty and the
 *         current word was simply deleted). false when
 *         - there is no current word,
 *         - the current word cannot be located in the row's word list or
 *           word_res_list (returned before any list is changed), or
 *         - a blob iterator over a non-empty list yields no data while blobs
 *           are being moved. In this last case the words handled so far have
 *           already been inserted and the current word is still removed.
 */
bool PAGE_RES_IT::ReplaceCurrentWord(
    tesseract::PointerVector<WERD_RES>* words) {
    if (words->empty()) {
        DeleteCurrentWord();
        return true;
    }
    WERD_RES* input_word = word();
    if (!input_word) //modified
        return false;
    // Set the BOL/EOL flags on the words from the input word.
    if (input_word->word->flag(W_BOL)) {
        (*words)[0]->word->set_flag(W_BOL, true);
    }
    else {
        (*words)[0]->word->set_blanks(input_word->word->space());
    }
    words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));
    // Move the blobs from the input word to the new set of words.
    // If the input word_res is a combination, then the replacements will also be
    // combinations, and will own their own words. If the input word_res is not a
    // combination, then the final replacements will not be either, (although it
    // is allowed for the input words to be combinations) and their words
    // will get put on the row list. This maintains the ownership rules.
    WERD_IT w_it(row()->row->word_list());
    if (!input_word->combination) {
        for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
            WERD* word = w_it.data();
            if (word == input_word->word) {
                break;
            }
        }
        // w_it is now set to the input_word's word.
        // modified: a full cycle means the word was not found. Report that
        // through the return value; an ASSERT_HOST here would abort before this
        // guard could ever run.
        if (w_it.cycled_list())
            return false;
    }
    // Insert into the appropriate place in the ROW_RES.
    WERD_RES_IT wr_it(&row()->word_res_list);
    for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
        WERD_RES* word = wr_it.data();
        if (word == input_word) {
            break;
        }
    }
    // modified: same as above, fail softly instead of asserting when the
    // input word is not in the word_res_list.
    if (wr_it.cycled_list())
        return false;
    // Since we only have an estimate of the bounds between blobs, use the blob
    // x-middle as the determiner of where to put the blobs
    C_BLOB_IT src_b_it(input_word->word->cblob_list());
    src_b_it.sort(&C_BLOB::SortByXMiddle);
    C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
    rej_b_it.sort(&C_BLOB::SortByXMiddle);
    TBOX clip_box;
    bool bSuccess = true; //modified
    // modified: stop handling further words once a blob failure was seen.
    for (size_t w = 0; bSuccess && w < words->size(); ++w) {
        WERD_RES* word_w = (*words)[w];
        clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
        // Compute blob boundaries.
        std::vector<int> blob_ends;
        C_BLOB_LIST* next_word_blobs =
            w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
        ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);
        // Remove the fake blobs on the current word, but keep safe for back-up if
        // no blob can be found.
        C_BLOB_LIST fake_blobs;
        C_BLOB_IT fake_b_it(&fake_blobs);
        fake_b_it.add_list_after(word_w->word->cblob_list());
        fake_b_it.move_to_first();
        word_w->word->cblob_list()->clear();
        C_BLOB_IT dest_it(word_w->word->cblob_list());
        // Build the box word as we move the blobs.
        auto* box_word = new tesseract::BoxWord;
        for (size_t i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) {
            // modified(important): a non-empty list whose iterator has no data
            // cannot be processed; flag the failure and stop moving blobs.
            if (!src_b_it.empty() && src_b_it.data() == nullptr)
            {
                bSuccess = false;
                break;
            }
            if (!rej_b_it.empty() && rej_b_it.data() == nullptr)
            {
                bSuccess = false;
                break;
            }
            int end_x = blob_ends[i];
            TBOX blob_box;
            // Add the blobs up to end_x.
            while (!src_b_it.empty() &&
                src_b_it.data() && //modified
                src_b_it.data()->bounding_box().x_middle() < end_x) {
                blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box);
                src_b_it.forward();
            }
            while (!rej_b_it.empty() &&
                rej_b_it.data() && //modified
                rej_b_it.data()->bounding_box().x_middle() < end_x) {
                blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box);
                rej_b_it.forward();
            }
            if (blob_box.null_box()) {
                // Use the original box as a back-up.
                blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box);
            }
            box_word->InsertBox(i, blob_box);
        }
        // The (possibly partially filled) box word always replaces the old one,
        // so neither of them is leaked on the failure path.
        delete word_w->box_word;
        word_w->box_word = box_word;
        if (!input_word->combination) {
            // Insert word_w->word into the ROW. It doesn't own its word, so the
            // ROW needs to own it.
            w_it.add_before_stay_put(word_w->word);
            word_w->combination = false;
        }
        (*words)[w] = nullptr; // We are taking ownership.
        wr_it.add_before_stay_put(word_w);
    }
    // We have taken ownership of the words.
    words->clear();
    // Delete the current word, which has been replaced. We could just call
    // DeleteCurrentWord, but that would iterate both lists again, and we know
    // we are already in the right place.
    if (!input_word->combination) {
        delete w_it.extract();
    }
    delete wr_it.extract();
    ResetWordIterator();
    return bSuccess; //modified
}
 
3. control.cpp 파일을 아래와 같이 수정
 
//control.cpp
/**
 * Recognizes one word, trying the most recently used language first and then
 * the remaining languages until the result is acceptable, and stores the best
 * result back into word_data / the page results.
 *
 * modified: the return type was originally void and is now bool; the
 * declaration in the header file must be changed to bool as well.
 *
 * @param pass_n    Recognition pass. Pass 1 uses classify_word_pass1, any other
 *                  value uses classify_word_pass2 (always pass1 when
 *                  DISABLED_LEGACY_ENGINE is defined).
 * @param pr_it     Page result iterator; its current word is replaced when the
 *                  best result is more than one word or a combination.
 * @param word_data The word to classify together with its per-language results.
 * @return true when the word was already done or classification finished
 *         (including the "no best words" case); false when
 *         ReplaceCurrentWord() failed or the resulting word has no box_word.
 */
bool Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it, WordData* word_data) {
#ifdef DISABLED_LEGACY_ENGINE
    WordRecognizer recognizer = &Tesseract::classify_word_pass1;
#else
    WordRecognizer recognizer =
        pass_n == 1 ? &Tesseract::classify_word_pass1 : &Tesseract::classify_word_pass2;
#endif // def DISABLED_LEGACY_ENGINE
 
    // Best result so far.
    PointerVector<WERD_RES> best_words;
    // Points to the best result. May be word or in lang_words.
    const WERD_RES* word = word_data->word;
    clock_t start_t = clock();
    const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
    if (debug) {
        tprintf("%s word with lang %s at:", word->done ? "Already done" : "Processing",
            most_recently_used_->lang.c_str());
        word->word->bounding_box().print();
    }
    if (word->done) {
        // If done on pass1, leave it as-is.
        if (!word->tess_failed) {
            most_recently_used_ = word->tesseract;
        }
        return true;//modified
    }
    // Index into lang_words for the most recently used language; the slot at
    // sub_langs_.size() is the one used for `this`.
    auto sub = sub_langs_.size();
    if (most_recently_used_ != this) {
        // Get the index of the most_recently_used_.
        for (sub = 0; sub < sub_langs_.size() && most_recently_used_ != sub_langs_[sub]; ++sub) {
        }
    }
    most_recently_used_->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[sub],
        &best_words);
    Tesseract* best_lang_tess = most_recently_used_;
    if (!WordsAcceptable(best_words)) {
        // Try all the other languages to see if they are any better.
        if (most_recently_used_ != this &&
            this->RetryWithLanguage(*word_data, recognizer, debug,
                &word_data->lang_words[sub_langs_.size()], &best_words) > 0) {
            best_lang_tess = this;
        }
        for (unsigned i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size(); ++i) {
            if (most_recently_used_ != sub_langs_[i] &&
                sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[i],
                    &best_words) > 0) {
                best_lang_tess = sub_langs_[i];
            }
        }
    }
    most_recently_used_ = best_lang_tess;
    if (!best_words.empty()) {
        if (best_words.size() == 1 && !best_words[0]->combination) {
            // Move the best single result to the main word.
            word_data->word->ConsumeWordResults(best_words[0]);
        }
        else {
            // Words came from LSTM, and must be moved to the PAGE_RES properly.
            // NOTE(review): word_data->word is re-pointed before the replacement
            // is known to succeed; callers should presumably not use it after a
            // false return - confirm against the callers.
            word_data->word = best_words.back();
            if (!pr_it->ReplaceCurrentWord(&best_words)) //modified(important)
                return false;
        }
        // modified: a missing box_word is reported through the return value; an
        // ASSERT_HOST here would abort before this guard could ever run.
        if (word_data->word->box_word == nullptr)
            return false;
    }
    else {
        tprintf("no best words!!\n");
    }
    clock_t ocr_t = clock();
    if (tessedit_timing_debug) {
        tprintf("%s (ocr took %.2f sec)\n", word_data->word->best_choice->unichar_string().c_str(),
            static_cast<double>(ocr_t - start_t) / CLOCKS_PER_SEC);
    }
 
    return true; //modified
}
 
 
4. control.cpp 파일 내에서 classify_word_and_language() 함수를 호출하는 부분을 모두, 기존 소스를 참고하여
에러 시 바로 리턴하도록 수정한다. param 부분은 기존 소스의 인자를 그대로 사용하고, 리턴값은 호출하는 함수의 리턴형에 맞춘다(아래 두 가지 예 참고).
 
// Call-site pattern, e.g. for a caller that returns bool: stop and pass the
// failure upward. param1..param3 stand for the arguments already used there.
if (!classify_word_and_language(param1, param2, param3))
      return false;
 
또는
 
// Call-site pattern, e.g. for a caller that returns float: stop and return
// 0.0f on failure. param1..param3 stand for the arguments already used there.
if (!classify_word_and_language(param1, param2, param3))
      return 0.0f;
행운을 바란다.