utm sync
This commit is contained in:
@@ -56,9 +56,130 @@ interface SyncState {
|
|||||||
last_sync_id?: string;
|
last_sync_id?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 定义UTM参数接口
|
||||||
|
interface UtmParams {
|
||||||
|
utm_source: string;
|
||||||
|
utm_medium: string;
|
||||||
|
utm_campaign: string;
|
||||||
|
utm_term: string;
|
||||||
|
utm_content: string;
|
||||||
|
}
|
||||||
|
|
||||||
// 同步状态键名
|
// 同步状态键名
|
||||||
const SYNC_STATE_KEY = "f/shorturl_analytics/mongo_sync_state";
|
const SYNC_STATE_KEY = "f/shorturl_analytics/mongo_sync_state";
|
||||||
|
|
||||||
|
// 从URL中提取UTM参数的函数,增强版
|
||||||
|
function extractUtmParams(url: string, debug = false): UtmParams {
|
||||||
|
const defaultUtmParams: UtmParams = {
|
||||||
|
utm_source: "",
|
||||||
|
utm_medium: "",
|
||||||
|
utm_campaign: "",
|
||||||
|
utm_term: "",
|
||||||
|
utm_content: ""
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!url) return defaultUtmParams;
|
||||||
|
|
||||||
|
if (debug) {
|
||||||
|
console.log(`[UTM提取] 原始URL: ${url}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 准备一个解析后的参数对象
|
||||||
|
const params: UtmParams = { ...defaultUtmParams };
|
||||||
|
|
||||||
|
// 尝试多种方法提取UTM参数
|
||||||
|
|
||||||
|
// 方法1: 使用URL对象解析
|
||||||
|
try {
|
||||||
|
// 先处理URL,确保是完整的URL格式
|
||||||
|
let normalizedUrl = url;
|
||||||
|
if (!url.match(/^https?:\/\//i)) {
|
||||||
|
normalizedUrl = `https://example.com${url.startsWith('/') ? '' : '/'}${url}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
const urlObj = new URL(normalizedUrl);
|
||||||
|
|
||||||
|
// 读取URL参数
|
||||||
|
if (urlObj.searchParams.has('utm_source'))
|
||||||
|
params.utm_source = urlObj.searchParams.get('utm_source') || "";
|
||||||
|
if (urlObj.searchParams.has('utm_medium'))
|
||||||
|
params.utm_medium = urlObj.searchParams.get('utm_medium') || "";
|
||||||
|
if (urlObj.searchParams.has('utm_campaign'))
|
||||||
|
params.utm_campaign = urlObj.searchParams.get('utm_campaign') || "";
|
||||||
|
if (urlObj.searchParams.has('utm_term'))
|
||||||
|
params.utm_term = urlObj.searchParams.get('utm_term') || "";
|
||||||
|
if (urlObj.searchParams.has('utm_content'))
|
||||||
|
params.utm_content = urlObj.searchParams.get('utm_content') || "";
|
||||||
|
|
||||||
|
if (debug) {
|
||||||
|
console.log(`[UTM提取] URL对象解析结果: ${JSON.stringify(params)}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 如果至少找到一个UTM参数,则返回
|
||||||
|
if (params.utm_source || params.utm_medium || params.utm_campaign ||
|
||||||
|
params.utm_term || params.utm_content) {
|
||||||
|
return params;
|
||||||
|
}
|
||||||
|
} catch (_err) {
|
||||||
|
if (debug) {
|
||||||
|
console.log(`[UTM提取] URL对象解析失败,尝试正则表达式`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 方法2: 使用正则表达式提取参数
|
||||||
|
// 使用正则表达式(最安全的方法,适用于任何格式)
|
||||||
|
const sourceMatch = url.match(/[?&]utm_source=([^&#]+)/i);
|
||||||
|
if (sourceMatch && sourceMatch[1]) {
|
||||||
|
try {
|
||||||
|
params.utm_source = decodeURIComponent(sourceMatch[1]);
|
||||||
|
} catch (_) {
|
||||||
|
params.utm_source = sourceMatch[1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const mediumMatch = url.match(/[?&]utm_medium=([^&#]+)/i);
|
||||||
|
if (mediumMatch && mediumMatch[1]) {
|
||||||
|
try {
|
||||||
|
params.utm_medium = decodeURIComponent(mediumMatch[1]);
|
||||||
|
} catch (_) {
|
||||||
|
params.utm_medium = mediumMatch[1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const campaignMatch = url.match(/[?&]utm_campaign=([^&#]+)/i);
|
||||||
|
if (campaignMatch && campaignMatch[1]) {
|
||||||
|
try {
|
||||||
|
params.utm_campaign = decodeURIComponent(campaignMatch[1]);
|
||||||
|
} catch (_) {
|
||||||
|
params.utm_campaign = campaignMatch[1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const termMatch = url.match(/[?&]utm_term=([^&#]+)/i);
|
||||||
|
if (termMatch && termMatch[1]) {
|
||||||
|
try {
|
||||||
|
params.utm_term = decodeURIComponent(termMatch[1]);
|
||||||
|
} catch (_) {
|
||||||
|
params.utm_term = termMatch[1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const contentMatch = url.match(/[?&]utm_content=([^&#]+)/i);
|
||||||
|
if (contentMatch && contentMatch[1]) {
|
||||||
|
try {
|
||||||
|
params.utm_content = decodeURIComponent(contentMatch[1]);
|
||||||
|
} catch (_) {
|
||||||
|
params.utm_content = contentMatch[1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (debug) {
|
||||||
|
console.log(`[UTM提取] 正则表达式解析结果: ${JSON.stringify(params)}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
return params;
|
||||||
|
}
|
||||||
|
|
||||||
export async function main(
|
export async function main(
|
||||||
batch_size = 1000,
|
batch_size = 1000,
|
||||||
max_records = 9999999,
|
max_records = 9999999,
|
||||||
@@ -66,7 +187,8 @@ export async function main(
|
|||||||
skip_clickhouse_check = false,
|
skip_clickhouse_check = false,
|
||||||
force_insert = false,
|
force_insert = false,
|
||||||
database_override = "shorturl_analytics", // 添加数据库名称参数,默认为shorturl_analytics
|
database_override = "shorturl_analytics", // 添加数据库名称参数,默认为shorturl_analytics
|
||||||
reset_sync_state = false // 添加参数用于重置同步状态
|
reset_sync_state = false, // 添加参数用于重置同步状态
|
||||||
|
debug_utm = false // 添加参数控制UTM调试日志输出
|
||||||
) {
|
) {
|
||||||
const logWithTimestamp = (message: string) => {
|
const logWithTimestamp = (message: string) => {
|
||||||
const now = new Date();
|
const now = new Date();
|
||||||
@@ -84,6 +206,9 @@ export async function main(
|
|||||||
if (reset_sync_state) {
|
if (reset_sync_state) {
|
||||||
logWithTimestamp("⚠️ 警告: 已启用重置同步状态,将从头开始同步数据");
|
logWithTimestamp("⚠️ 警告: 已启用重置同步状态,将从头开始同步数据");
|
||||||
}
|
}
|
||||||
|
if (debug_utm) {
|
||||||
|
logWithTimestamp("已启用UTM参数调试日志");
|
||||||
|
}
|
||||||
|
|
||||||
// 设置超时
|
// 设置超时
|
||||||
const startTime = Date.now();
|
const startTime = Date.now();
|
||||||
@@ -407,42 +532,64 @@ export async function main(
|
|||||||
}).toArray();
|
}).toArray();
|
||||||
|
|
||||||
// 创建映射用于快速查找 - 新增代码
|
// 创建映射用于快速查找 - 新增代码
|
||||||
const shortLinksMap = new Map(shortLinks.map(link => [link._id.toString(), link]));
|
const shortLinksMap = new Map(shortLinks.map((link: ShortRecord) => [link._id.toString(), link]));
|
||||||
logWithTimestamp(`获取到 ${shortLinks.length} 条短链接信息`);
|
logWithTimestamp(`获取到 ${shortLinks.length} 条短链接信息`);
|
||||||
|
|
||||||
// 准备ClickHouse插入数据
|
// 准备ClickHouse插入数据
|
||||||
const clickhouseData = newRecords.map(record => {
|
const clickhouseData = newRecords.map(record => {
|
||||||
const eventTime = new Date(record.createTime);
|
const eventTime = new Date(record.createTime);
|
||||||
|
|
||||||
// 获取对应的短链接信息 - 新增代码
|
// 获取对应的短链接信息 - 新增代码
|
||||||
const shortLink = shortLinksMap.get(record.slugId.toString());
|
const shortLink = shortLinksMap.get(record.slugId.toString()) as ShortRecord | undefined;
|
||||||
|
|
||||||
|
// 提取URL中的UTM参数 - 增加调试日志
|
||||||
|
if (debug_utm && record.url) {
|
||||||
|
logWithTimestamp(`======== UTM参数调试 ========`);
|
||||||
|
logWithTimestamp(`记录ID: ${record._id.toString()}`);
|
||||||
|
logWithTimestamp(`原始URL: ${record.url}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const utmParams = extractUtmParams(record.url || "", debug_utm);
|
||||||
|
|
||||||
|
if (debug_utm) {
|
||||||
|
logWithTimestamp(`提取的UTM参数: ${JSON.stringify(utmParams)}`);
|
||||||
|
logWithTimestamp(`===========================`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 保存提取的UTM参数和URL到event_attributes
|
||||||
|
const eventAttributes = {
|
||||||
|
mongo_id: record._id.toString(),
|
||||||
|
url: record.url || "",
|
||||||
|
...(record.url ? { raw_url: record.url } : {})
|
||||||
|
};
|
||||||
|
|
||||||
// 转换MongoDB记录为ClickHouse格式,匹配ClickHouse表结构
|
// 转换MongoDB记录为ClickHouse格式,匹配ClickHouse表结构
|
||||||
return {
|
return {
|
||||||
// UUID将由ClickHouse自动生成 (event_id)
|
// UUID将由ClickHouse自动生成 (event_id)
|
||||||
event_time: eventTime.toISOString().replace('T', ' ').replace('Z', ''),
|
event_time: eventTime.toISOString().replace('T', ' ').replace('Z', ''),
|
||||||
event_type: record.type === 1 ? "visit" : "custom",
|
event_type: record.type === 1 ? "visit" : "custom",
|
||||||
event_attributes: `{"mongo_id":"${record._id.toString()}"}`,
|
event_attributes: JSON.stringify(eventAttributes),
|
||||||
link_id: record.slugId.toString(),
|
link_id: record.slugId.toString(),
|
||||||
link_slug: shortLink?.slug || "", // 新增: 从short获取slug
|
link_slug: shortLink?.slug || "",
|
||||||
link_label: record.label || "",
|
link_label: record.label || "",
|
||||||
link_title: shortLink?.title || "", // 新增: 从short获取标题
|
link_title: shortLink?.title || "",
|
||||||
link_original_url: shortLink?.origin || "", // 新增: 从short获取原始URL
|
link_original_url: shortLink?.origin || "",
|
||||||
link_attributes: JSON.stringify({ domain: shortLink?.domain || null }), // 新增: 从short获取域名信息
|
link_attributes: JSON.stringify({ domain: shortLink?.domain || null }),
|
||||||
link_created_at: shortLink?.createTime
|
link_created_at: shortLink?.createTime
|
||||||
? new Date(shortLink.createTime).toISOString().replace('T', ' ').replace('Z', '')
|
? new Date(shortLink.createTime).toISOString().replace('T', ' ').replace('Z', '')
|
||||||
: eventTime.toISOString().replace('T', ' ').replace('Z', ''), // 新增: 使用真实的创建时间
|
: eventTime.toISOString().replace('T', ' ').replace('Z', ''),
|
||||||
link_expires_at: shortLink?.expiresAt
|
link_expires_at: shortLink?.expiresAt
|
||||||
? new Date(shortLink.expiresAt).toISOString().replace('T', ' ').replace('Z', '')
|
? new Date(shortLink.expiresAt).toISOString().replace('T', ' ').replace('Z', '')
|
||||||
: null, // 新增: 使用真实的过期时间
|
: null,
|
||||||
link_tags: shortLink?.tags ? JSON.stringify(shortLink.tags) : "[]", // 新增: 从short获取标签
|
link_tags: shortLink?.tags ? JSON.stringify(shortLink.tags) : "[]",
|
||||||
user_id: shortLink?.user || "", // 新增: 从short获取用户ID
|
user_id: shortLink?.user || "",
|
||||||
user_name: "",
|
user_name: "",
|
||||||
user_email: "",
|
user_email: "",
|
||||||
user_attributes: "{}",
|
user_attributes: "{}",
|
||||||
team_id: shortLink?.teamId || "", // 新增: 从short获取团队ID
|
team_id: shortLink?.teamId || "",
|
||||||
team_name: "",
|
team_name: "",
|
||||||
team_attributes: "{}",
|
team_attributes: "{}",
|
||||||
project_id: shortLink?.projectId || "", // 新增: 从short获取项目ID
|
project_id: shortLink?.projectId || "",
|
||||||
project_name: "",
|
project_name: "",
|
||||||
project_attributes: "{}",
|
project_attributes: "{}",
|
||||||
qr_code_id: "",
|
qr_code_id: "",
|
||||||
@@ -458,11 +605,11 @@ export async function main(
|
|||||||
os: record.platformOS || "",
|
os: record.platformOS || "",
|
||||||
user_agent: record.browser + " " + record.browserVersion,
|
user_agent: record.browser + " " + record.browserVersion,
|
||||||
referrer: record.url || "",
|
referrer: record.url || "",
|
||||||
utm_source: "",
|
utm_source: utmParams.utm_source,
|
||||||
utm_medium: "",
|
utm_medium: utmParams.utm_medium,
|
||||||
utm_campaign: "",
|
utm_campaign: utmParams.utm_campaign,
|
||||||
utm_term: "",
|
utm_term: utmParams.utm_term,
|
||||||
utm_content: "",
|
utm_content: utmParams.utm_content,
|
||||||
time_spent_sec: 0,
|
time_spent_sec: 0,
|
||||||
is_bounce: true,
|
is_bounce: true,
|
||||||
is_qr_scan: false,
|
is_qr_scan: false,
|
||||||
@@ -582,6 +729,17 @@ export async function main(
|
|||||||
if (records.length > 1) {
|
if (records.length > 1) {
|
||||||
logWithTimestamp(`批次 ${page+1} 最后一条记录: ID=${records[records.length-1]._id}, 时间=${new Date(records[records.length-1].createTime).toISOString()}`);
|
logWithTimestamp(`批次 ${page+1} 最后一条记录: ID=${records[records.length-1]._id}, 时间=${new Date(records[records.length-1].createTime).toISOString()}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 如果开启了调试,输出一些URL样本
|
||||||
|
if (debug_utm) {
|
||||||
|
const sampleSize = Math.min(5, records.length);
|
||||||
|
logWithTimestamp(`URL样本 (前${sampleSize}条):`);
|
||||||
|
for (let i = 0; i < sampleSize; i++) {
|
||||||
|
if (records[i].url) {
|
||||||
|
logWithTimestamp(`样本 ${i+1}: ${records[i].url}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const batchSize = await processRecords(records);
|
const batchSize = await processRecords(records);
|
||||||
|
|||||||
Reference in New Issue
Block a user