libmobi
C library for handling MOBI format ebook documents
|
Functions for parsing rawml markup. More...
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "parse_rawml.h"
#include "util.h"
#include "opf.h"
#include "structure.h"
#include "index.h"
#include "debug.h"
Macros | |
#define | _GNU_SOURCE 1 |
#define | __USE_BSD /* for strdup on linux/glibc */ |
Functions | |
size_t | mobi_get_rawlink_location (const MOBIRawml *rawml, const uint32_t pos_fid, const uint32_t pos_off) |
Convert kindle:pos:fid:x:off:y to offset in rawml raw text file. More... | |
MOBI_RET | mobi_search_links_kf7 (MOBIResult *result, const unsigned char *data_start, const unsigned char *data_end) |
Find first occurrence of attribute to be replaced in KF7 html. More... | |
MOBI_RET | mobi_find_attrvalue (MOBIResult *result, const unsigned char *data_start, const unsigned char *data_end, const MOBIFiletype type, const char *needle) |
Find first occurrence of markup attribute with given value. More... | |
MOBI_RET | mobi_find_attrname (MOBIResult *result, const unsigned char *data_start, const unsigned char *data_end, const char *attrname) |
Find first occurrence of markup attribute with given name. More... | |
MOBI_RET | mobi_search_links_kf8 (MOBIResult *result, const unsigned char *data_start, const unsigned char *data_end, const MOBIFiletype type) |
Find first occurrence of attribute part to be replaced in KF8 html/css. More... | |
size_t | mobi_get_attribute_value (char *value, const unsigned char *data, const size_t size, const char *attribute, bool only_quoted) |
Get value and offset of the first found attribute with given name. More... | |
size_t | mobi_get_aid_offset (const MOBIPart *html, const char *aid) |
Get offset of the given value of an "aid" attribute in a given part. More... | |
MOBI_RET | mobi_get_offset_by_posoff (uint32_t *file_number, size_t *offset, const MOBIRawml *rawml, const size_t pos_fid, const size_t pos_off) |
Convert kindle:pos:fid:x:off:y to skeleton part number and offset from the beginning of the part. More... | |
MOBI_RET | mobi_get_aid_by_offset (char *aid, const MOBIPart *html, const size_t offset) |
Get value of the closest "aid" attribute following given offset in a given part. More... | |
MOBI_RET | mobi_get_id_by_offset (char *id, const MOBIPart *html, const size_t offset, MOBIAttrType *pref_attr) |
Get value of the closest "id" or "name" attribute following given offset in a given part. More... | |
MOBI_RET | mobi_get_aid_by_posoff (uint32_t *file_number, char *aid, const MOBIRawml *rawml, const size_t pos_fid, const size_t pos_off) |
Convert kindle:pos:fid:x:off:y to html file number and closest "aid" attribute following the position. More... | |
MOBI_RET | mobi_get_id_by_posoff (uint32_t *file_number, char *id, const MOBIRawml *rawml, const size_t pos_fid, const size_t pos_off, MOBIAttrType *pref_attr) |
Convert kindle:pos:fid:x:off:y to html file number and closest "id" attribute following the position. More... | |
MOBI_RET | mobi_reconstruct_resources (const MOBIData *m, MOBIRawml *rawml) |
Parse resource records (images, fonts etc), determine their type, link to rawml. More... | |
MOBI_RET | mobi_process_replica (unsigned char *pdf, const char *text, size_t *length) |
Parse Replica Print ebook (azw4). Extract pdf. More... | |
MOBI_RET | mobi_reconstruct_flow (MOBIRawml *rawml, const char *text, const size_t length) |
Parse raw text into flow parts. More... | |
MOBI_RET | mobi_reconstruct_parts (MOBIRawml *rawml) |
Parse raw html into html parts. Use index entries if present to parse file. More... | |
MOBI_RET | mobi_get_filepos_array (MOBIArray *links, const MOBIPart *part) |
Scan html part and build array of filepos link target offsets. More... | |
MOBI_RET | mobi_get_ncx_filepos_array (MOBIArray *links, const MOBIRawml *rawml) |
Scan ncx part and build array of filepos link target offsets. More... | |
MOBI_RET | mobi_posfid_to_link (char *link, const MOBIRawml *rawml, const char *value, MOBIAttrType *pref_attr) |
Replace kindle:pos link with html href. More... | |
MOBI_RET | mobi_flow_to_link (char *link, const MOBIRawml *rawml, const char *value) |
Replace kindle:flow link with html href. More... | |
MOBI_RET | mobi_embed_to_link (char *link, const MOBIRawml *rawml, const char *value) |
Replace kindle:embed link with html href. More... | |
MOBI_RET | mobi_reconstruct_links_kf8 (const MOBIRawml *rawml) |
Replace offset-links with html-links in KF8 markup. More... | |
MOBI_RET | mobi_reconstruct_infl (char *outstring, const MOBIIndx *infl, const MOBIIndexEntry *orth_entry) |
Get infl index markup for given orth entry. More... | |
MOBI_RET | mobi_reconstruct_infl_v1 (char *outstring, MOBITrie *const infl_tree, const MOBIIndexEntry *orth_entry) |
Get infl index markup for given orth entry. More... | |
MOBI_RET | mobi_reconstruct_orth (const MOBIRawml *rawml, MOBIFragment *first, size_t *new_size) |
Insert orth index markup to linked list of fragments. More... | |
MOBI_RET | mobi_reconstruct_links_kf7 (const MOBIRawml *rawml) |
Replace offset-links with html-links in KF7 markup. Also reconstruct dictionary markup if present. More... | |
MOBI_RET | mobi_reconstruct_links (const MOBIRawml *rawml) |
Replace offset-links with html-links. More... | |
MOBI_RET | mobi_iterate_txtparts (MOBIRawml *rawml, MOBI_RET(*cb)(MOBIPart *)) |
Call callback function for each text record. More... | |
MOBI_RET | mobi_markup_to_utf8 (MOBIPart *part) |
Convert MOBIPart part data to utf8. More... | |
MOBI_RET | mobi_strip_mobitags (MOBIPart *part) |
Strip unneeded tags from html. Currently only <aid> More... | |
MOBI_RET | mobi_parse_rawml (MOBIRawml *rawml, const MOBIData *m) |
Parse raw records into html flow parts, markup parts, resources and indices. More... | |
MOBI_RET | mobi_parse_rawml_opt (MOBIRawml *rawml, const MOBIData *m, bool parse_toc, bool parse_dict, bool reconstruct) |
Parse raw records into html flow parts, markup parts, resources and indices. Individual stages of the parsing may be turned on/off. More... | |
Functions for parsing rawml markup.
Copyright (c) 2020 Bartek Fabiszewski http://www.fabiszewski.net
This file is part of libmobi. Licensed under LGPL, either version 3, or any later. See http://www.gnu.org/licenses/
Replace kindle:embed link with html href.
[in,out] | link | Memory area which will be filled with "resource00000.ext", including quotation marks |
[in] | rawml | Structure rawml |
[in] | value | String kindle:embed:0000?mime=type, with optional quotation marks |
MOBI_RET mobi_find_attrname | ( | MOBIResult * | result, |
const unsigned char * | data_start, | ||
const unsigned char * | data_end, | ||
const char * | attrname | ||
) |
Find first occurrence of markup attribute with given name.
[in,out] | result | MOBIResult structure will be filled with found data |
[in] | data_start | Beginning of the memory area to search in |
[in] | data_end | End of the memory area to search in |
[in] | attrname | String to find (len < MOBI_ATTRNAME_MAXSIZE) |
MOBI_RET mobi_find_attrvalue | ( | MOBIResult * | result, |
const unsigned char * | data_start, | ||
const unsigned char * | data_end, | ||
const MOBIFiletype | type, | ||
const char * | needle | ||
) |
Find first occurrence of markup attribute with given value.
[in,out] | result | MOBIResult structure will be filled with found data |
[in] | data_start | Beginning of the memory area to search in |
[in] | data_end | End of the memory area to search in |
[in] | type | Type of data (T_HTML or T_CSS) |
[in] | needle | String to find (len <= MOBI_ATTRNAME_MAXSIZE) |
Replace kindle:flow link with html href.
[in,out] | link | Memory area which will be filled with "part00000.ext", including quotation marks |
[in] | rawml | Structure rawml |
[in] | value | String kindle:flow:0000?mime=type, without quotation marks |
Get value of the closest "aid" attribute following given offset in a given part.
[in,out] | aid | String value of "aid" attribute |
[in] | html | MOBIPart html part |
[in] | offset | Offset from the beginning of the part data |
MOBI_RET mobi_get_aid_by_posoff | ( | uint32_t * | file_number, |
char * | aid, | ||
const MOBIRawml * | rawml, | ||
const size_t | pos_fid, | ||
const size_t | pos_off | ||
) |
Convert kindle:pos:fid:x:off:y to html file number and closest "aid" attribute following the position.
[in,out] | file_number | Will be set to file number value |
[in,out] | aid | String value of "aid" attribute |
[in] | rawml | MOBIRawml parsed records structure |
[in] | pos_fid | X value of pos:fid:x |
[in] | pos_off | Y value of off:y |
size_t mobi_get_aid_offset | ( | const MOBIPart * | html, |
const char * | aid | ||
) |
Get offset of the given value of an "aid" attribute in a given part.
[in] | aid | String value of "aid" attribute |
[in] | html | MOBIPart html part |
size_t mobi_get_attribute_value | ( | char * | value, |
const unsigned char * | data, | ||
const size_t | size, | ||
const char * | attribute, | ||
bool | only_quoted | ||
) |
Get value and offset of the first found attribute with given name.
[in,out] | value | String value of the attribute, will be filled by the function, zero length if not found |
[in] | data | Data to search in |
[in] | size | Data size |
[in] | attribute | Attribute name |
[in] | only_quoted | Require the value to be quoted if true, allow no quotes (eg. filepos=00001) if false |
MOBI_RET mobi_get_id_by_offset | ( | char * | id, |
const MOBIPart * | html, | ||
const size_t | offset, | ||
MOBIAttrType * | pref_attr | ||
) |
Get value of the closest "id" or "name" attribute following given offset in a given part.
[in,out] | id | String value of found attribute |
[in] | html | MOBIPart html part |
[in] | offset | Offset from the beginning of the part data |
[in,out] | pref_attr | Preferred attribute to link to (id or name) |
MOBI_RET mobi_get_id_by_posoff | ( | uint32_t * | file_number, |
char * | id, | ||
const MOBIRawml * | rawml, | ||
const size_t | pos_fid, | ||
const size_t | pos_off, | ||
MOBIAttrType * | pref_attr | ||
) |
Convert kindle:pos:fid:x:off:y to html file number and closest "id" attribute following the position.
[in,out] | file_number | Will be set to file number value |
[in,out] | id | String value of "id" attribute |
[in] | rawml | MOBIRawml parsed records structure |
[in] | pos_fid | X value of pos:fid:x |
[in] | pos_off | Y value of off:y |
[in,out] | pref_attr | Attribute to link to |
MOBI_RET mobi_get_offset_by_posoff | ( | uint32_t * | file_number, |
size_t * | offset, | ||
const MOBIRawml * | rawml, | ||
const size_t | pos_fid, | ||
const size_t | pos_off | ||
) |
Convert kindle:pos:fid:x:off:y to skeleton part number and offset from the beginning of the part.
[in,out] | file_number | Will be set to file number value |
[in,out] | offset | Offset from the beginning of the skeleton part |
[in] | rawml | MOBIRawml parsed records structure |
[in] | pos_fid | X value of pos:fid:x |
[in] | pos_off | Y value of off:y |
size_t mobi_get_rawlink_location | ( | const MOBIRawml * | rawml, |
const uint32_t | pos_fid, | ||
const uint32_t | pos_off | ||
) |
Convert kindle:pos:fid:x:off:y to offset in rawml raw text file.
[in] | rawml | MOBIRawml parsed records structure |
[in] | pos_fid | X value of pos:fid:x |
[in] | pos_off | Y value of off:y |
Call callback function for each text record.
[in,out] | rawml | Structure rawml will be filled with reconstructed parts and resources |
[in,out] | cb | Callback function |
MOBI_RET mobi_posfid_to_link | ( | char * | link, |
const MOBIRawml * | rawml, | ||
const char * | value, | ||
MOBIAttrType * | pref_attr | ||
) |
Replace kindle:pos link with html href.
[in,out] | link | Memory area which will be filled with "part00000.html#customid", including quotation marks |
[in] | rawml | Structure rawml |
[in] | value | String kindle:pos:fid:0000:off:0000000000, without quotation marks |
[in,out] | pref_attr | Preferred attribute to link to (id or name) |
MOBI_RET mobi_process_replica | ( | unsigned char * | pdf, |
const char * | text, | ||
size_t * | length | ||
) |
Parse Replica Print ebook (azw4). Extract pdf.
[in,out] | pdf | Memory area will be filled with extracted pdf data |
[in] | text | Raw decompressed text to be parsed |
[in,out] | length | Text length. Will be updated with pdf_length on return |
Parse raw text into flow parts.
[in,out] | rawml | Structure rawml->flow will be filled with parsed flow text parts |
[in] | text | Raw decompressed text to be parsed |
[in] | length | Text length |
MOBI_RET mobi_reconstruct_infl | ( | char * | outstring, |
const MOBIIndx * | infl, | ||
const MOBIIndexEntry * | orth_entry | ||
) |
Get infl index markup for given orth entry.
[in,out] | outstring | Reconstructed tag <idx:infl> |
[in] | infl | MOBIIndx structure with parsed infl index |
[in] | orth_entry | Orth index entry |
MOBI_RET mobi_reconstruct_infl_v1 | ( | char * | outstring, |
MOBITrie *const | infl_tree, | ||
const MOBIIndexEntry * | orth_entry | ||
) |
Get infl index markup for given orth entry.
This function implements the inflection scheme used in older Mobipocket dictionaries.
[in,out] | outstring | Reconstructed tag <idx:infl> |
[in] | infl_tree | MOBITrie structure with inflection rules |
[in] | orth_entry | Orth index entry |
Replace offset-links with html-links.
[in,out] | rawml | Structure rawml will be filled with reconstructed parts and resources |
Replace offset-links with html-links in KF7 markup. Also reconstruct dictionary markup if present.
[in,out] | rawml | Structure rawml will be filled with reconstructed parts and resources |
Replace offset-links with html-links in KF8 markup.
[in,out] | rawml | Structure rawml will be filled with reconstructed parts and resources |
MOBI_RET mobi_reconstruct_orth | ( | const MOBIRawml * | rawml, |
MOBIFragment * | first, | ||
size_t * | new_size | ||
) |
Insert orth index markup to linked list of fragments.
[in] | rawml | Structure rawml contains orth index data |
[in,out] | first | First element of the linked list |
[in,out] | new_size | Counter to be updated with inserted fragments size |
Parse raw html into html parts. Use index entries if present to parse file.
[in,out] | rawml | Structure rawml->markup will be filled with reconstructed html parts |
Parse resource records (images, fonts etc), determine their type, link to rawml.
[in] | m | MOBIData structure with loaded Record(s) 0 headers |
[in,out] | rawml | Structure rawml->resources will be filled with parsed resources metadata and linked records data |
MOBI_RET mobi_search_links_kf7 | ( | MOBIResult * | result, |
const unsigned char * | data_start, | ||
const unsigned char * | data_end | ||
) |
Find first occurrence of attribute to be replaced in KF7 html.
It searches for filepos and recindex attributes
[in,out] | result | MOBIResult structure will be filled with found data |
[in] | data_start | Beginning of the memory area to search in |
[in] | data_end | End of the memory area to search in |
MOBI_RET mobi_search_links_kf8 | ( | MOBIResult * | result, |
const unsigned char * | data_start, | ||
const unsigned char * | data_end, | ||
const MOBIFiletype | type | ||
) |
Find first occurrence of attribute part to be replaced in KF8 html/css.
It searches for "kindle:" value in attributes
[in,out] | result | MOBIResult structure will be filled with found data |
[in] | data_start | Beginning of the memory area to search in |
[in] | data_end | End of the memory area to search in |
[in] | type | Type of data (T_HTML or T_CSS) |