URL Parsing in C++
#include <iostream>
#include <string>
#include <map>
#include <cstdlib>
#include <regex>
#include <stdexcept>
// A simple URL parsing class.
//
// Splits a URL of the general form
//   scheme://user:password@host:port/path?query#fragment
// into its components. A component that is absent from the input is left
// as an empty string.
//
// Note that `query` keeps its leading '?' and `fragment` keeps its leading
// '#', exactly as they appear in the input.
class URL {
public:
    // Parses url_string and fills in the public fields below.
    // Throws std::runtime_error("Invalid URL") when the string does not match
    // the supported grammar (for example a bare relative path with no
    // leading '/').
    URL(const std::string& url_string) {
        parse(url_string);
    }

    std::string scheme;    // text before the first ':' (without the ':')
    std::string user;      // userinfo before the optional ':'
    std::string password;  // userinfo after the optional ':'
    std::string host;      // host name, without the port
    std::string port;      // decimal digits after "host:", as text
    std::string path;      // path including its leading '/'
    std::string query;     // query including its leading '?'
    std::string fragment;  // fragment including its leading '#'

private:
    // Matches url_string against the URL grammar and copies the capture
    // groups into the member fields.
    void parse(const std::string& url_string) {
        // Default (ECMAScript) grammar: the POSIX "extended" grammar has no
        // \d escape, so the port is spelled [0-9] and no grammar flag is
        // passed. The userinfo classes exclude '/', '?' and '#' so that an
        // '@' appearing later in the path/query/fragment is not mistaken for
        // the userinfo delimiter.
        //
        // Capture groups: 2 = scheme, 6 = user, 7 = password, 8 = host,
        // 9 = port, 11 = path, 12 = query, 13 = fragment.
        //
        // Compiled once; building a std::regex is expensive.
        static const std::regex url_regex(
            R"(^(([^:/?#]+):)?(//((([^:@/?#]*):?([^:@/?#]*))?@)?([^:/?#]*):?([0-9]*))?((/[^?#]*)?(\?[^#]*)?(#.*)?))"
        );

        std::smatch match;
        if (!std::regex_match(url_string, match, url_regex)) {
            throw std::runtime_error("Invalid URL");
        }

        // Unmatched optional groups yield empty strings.
        scheme = match[2].str();
        user = match[6].str();
        password = match[7].str();
        host = match[8].str();
        port = match[9].str();
        path = match[11].str();
        query = match[12].str();
        fragment = match[13].str();
    }
};
// Parses a URL query string into a key -> value map.
//
// `query` may be given with or without its leading '?' (URL::query keeps
// it). Parameters are separated by '&' and must have the form key=value with
// a non-empty key; anything else is skipped. If a key occurs more than once,
// the last value wins. No percent-decoding is performed.
// An empty query yields an empty map.
std::map<std::string, std::string> parse_query(const std::string& query) {
    std::map<std::string, std::string> result;

    // Skip the leading '?' only when it is actually there. Advancing
    // unconditionally would step past end() on an empty query.
    auto first = query.begin();
    if (first != query.end() && *first == '?') {
        ++first;
    }

    static const std::regex param_regex("([^&=]+)=([^&]*)");
    const std::sregex_iterator end;
    for (std::sregex_iterator it(first, query.end(), param_regex); it != end; ++it) {
        const std::smatch& match = *it;
        result[match[1].str()] = match[2].str();
    }
    return result;
}
int main() {
// We'll parse this example URL, which includes a
// scheme, authentication info, host, port, path,
// query params, and query fragment.
std::string s = "postgres://user:pass@host.com:5432/path?k=v#f";
// Parse the URL and ensure there are no errors.
URL u(s);
// Accessing the scheme is straightforward.
std::cout << u.scheme << std::endl;
// User contains all authentication info
std::cout << u.user << ":" << u.password << std::endl;
std::cout << u.user << std::endl;
std::cout << u.password << std::endl;
// The Host contains both the hostname and the port,
// if present.
std::cout << u.host << ":" << u.port << std::endl;
std::cout << u.host << std::endl;
std::cout << u.port << std::endl;
// Here we extract the path and the fragment after
// the #.
std::cout << u.path << std::endl;
std::cout << u.fragment << std::endl;
// To get query params in a string of k=v format,
// use query. You can also parse query params
// into a map.
std::cout << u.query << std::endl;
auto m = parse_query(u.query);
for (const auto& pair : m) {
std::cout << pair.first << ": " << pair.second << std::endl;
}
return 0;
}
This C++ program demonstrates URL parsing using a custom URL
class and regular expressions. Here’s a breakdown of what the program does:
We define a `URL` class that parses a URL string into its components.
The `parse_query` function is used to parse the query string into a map.
In the `main` function, we create a sample URL string.
We create a `URL` object by parsing the string.
We then print out various components of the URL:
- The scheme
- User and password information
- Host and port
- Path and fragment
- Raw query string
Finally, we parse the query string into a map and print out the key-value pairs.
To compile and run this program:
$ g++ -std=c++11 url_parsing.cpp -o url_parsing
$ ./url_parsing
This will output the different pieces of the URL that we extracted.
Note that C++ doesn’t have built-in URL parsing libraries like some other languages, so we’ve implemented a basic parser using regular expressions. For production use, you might want to use a more robust third-party library for URL parsing.