rw: create table when the app starts
@@ -1,6 +1,12 @@
-- Drop the old tables
DROP TABLE IF EXISTS events;

DROP TABLE IF EXISTS follower_events;

DROP TABLE IF EXISTS like_events;

DROP TABLE IF EXISTS view_events;

DROP TABLE IF EXISTS mv_kol_performance;

DROP TABLE IF EXISTS mv_platform_distribution;
12
backend/db/sql/clickhouse/sync_logs.sql
Normal file
@@ -0,0 +1,12 @@
CREATE TABLE IF NOT EXISTS promote.sync_logs (
    timestamp DateTime,
    duration_ms UInt64,
    posts_synced UInt32,
    comments_synced UInt32,
    influencer_changes_synced UInt32,
    projects_synced UInt32,
    success UInt8,
    error_messages String
) ENGINE = MergeTree()
ORDER BY
    (timestamp)
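The new promote.sync_logs table keeps one row per sync run. Below is a minimal sketch of how recent runs could be inspected from the backend, assuming the @clickhouse/client instance exported by backend/src/utils/clickhouse that the rest of this commit imports; the relative import path assumes a file somewhere under backend/src/.

import clickhouse from '../utils/clickhouse';

// Sketch: list the ten most recent sync runs, newest first.
async function recentSyncRuns() {
  const result = await clickhouse.query({
    query: `
      SELECT timestamp, duration_ms, posts_synced, comments_synced,
             influencer_changes_synced, projects_synced, success, error_messages
      FROM promote.sync_logs
      ORDER BY timestamp DESC
      LIMIT 10
    `,
    format: 'JSONEachRow',
  });
  return result.json(); // same result-consumption style as syncService.ts below
}
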
@@ -12,7 +12,8 @@
    "test:analytics": "tsx scripts/analytics-test.ts",
    "ch": "bash db/sql/clickhouse/ch-query.sh",
    "check-schema": "node db/db-inspector/run-all.js",
    "pg": "node db/sql/postgres/pg-query.js"
    "pg": "node db/sql/postgres/pg-query.js",
    "manual-sync": "tsx src/scripts/manualSync.ts"
  },
  "keywords": [],
  "author": "",
@@ -39,6 +40,7 @@
    "@types/dotenv": "^8.2.3",
    "@types/jsonwebtoken": "^9.0.6",
    "@types/node": "^20.11.30",
    "@types/pg": "^8.11.11",
    "@types/uuid": "^10.0.0",
    "@typescript-eslint/eslint-plugin": "^7.4.0",
    "@typescript-eslint/parser": "^7.4.0",

@@ -11,11 +11,9 @@ import commentsRouter from './routes/comments';
import influencersRouter from './routes/influencers';
import projectsRouter from './routes/projects';
import { connectRedis } from './utils/redis';
import { initClickHouse } from './utils/clickhouse';
import { initWorkers } from './utils/queue';
import { checkDatabaseConnection } from './utils/initDatabase';
import { createSwaggerUI } from './swagger';
import { initScheduledTaskWorkers } from './utils/scheduledTasks';
import { initScheduledTaskWorkers, scheduleDatabaseSync } from './utils/scheduledTasks';

// Create Hono app
const app = new Hono();
@@ -64,23 +62,6 @@ const startServer = async () => {
    console.log('Continuing with mock Redis client...');
  }

  // Initialize ClickHouse
  try {
    await initClickHouse();
    console.log('ClickHouse initialized');
  } catch (error) {
    console.error('Failed to initialize ClickHouse:', error);
    console.log('Continuing with limited analytics functionality...');
  }

  // Check the database connection, but do not automatically initialize or modify the database
  try {
    await checkDatabaseConnection();
  } catch (error) {
    console.error('Database connection check failed:', error);
    console.log('Some features may not work correctly if database is not properly set up');
  }

  // Initialize workers for background processing
  console.log('🏗️ Initializing workers...');
  const workers = {
@@ -88,6 +69,16 @@ const startServer = async () => {
    scheduledTaskWorker: initScheduledTaskWorkers()
  };

  // Schedule regular database sync task (every 15 minutes by default)
  try {
    console.log('📊 Setting up database sync scheduled task...');
    await scheduleDatabaseSync();
    console.log('Database sync task scheduled successfully');
  } catch (error) {
    console.error('Failed to schedule database sync task:', error);
    console.log('Analytics data may not be automatically updated');
  }

  // Start server
  const port = Number(config.port);
  console.log(`Server starting on port ${port}...`);

76
backend/src/scripts/manualSync.ts
Normal file
@@ -0,0 +1,76 @@
#!/usr/bin/env ts-node
/**
 * Manually runs the PostgreSQL-to-ClickHouse sync task.
 *
 * Usage:
 * npm run manual-sync
 *
 * Or with a start-time parameter:
 * npm run manual-sync -- --from="2025-03-11T00:00:00Z"
 */

import { syncAllData } from '../services/syncService';
import * as dotenv from 'dotenv';
import { join } from 'path';

// Load environment variables
dotenv.config({ path: join(__dirname, '../../.env') });

// Parse command-line arguments
const getParam = (paramName: string): string | undefined => {
  const args = process.argv.slice(2);
  const param = args.find(arg => arg.startsWith(`--${paramName}=`));
  if (!param) return undefined;

  return param.split('=')[1];
};

async function main() {
  try {
    // Get the start-time parameter; default to one hour ago
    const fromTime = getParam('from');
    let fromTimestamp: string;

    if (fromTime) {
      fromTimestamp = new Date(fromTime).toISOString();
    } else {
      const oneHourAgo = new Date();
      oneHourAgo.setHours(oneHourAgo.getHours() - 1);
      fromTimestamp = oneHourAgo.toISOString();
    }

    console.log(`Starting data sync, from timestamp: ${fromTimestamp}`);

    // Run the sync
    const startTime = Date.now();
    const result = await syncAllData(fromTimestamp);
    const endTime = Date.now();

    // Print the results
    console.log('============================================');
    console.log('Sync task completed!');
    console.log('============================================');
    console.log(`Total time: ${(endTime - startTime) / 1000} s`);
    console.log(`Sync status: ${result.success ? 'success' : 'partial failure'}`);
    console.log(`Posts synced: ${result.posts}`);
    console.log(`Comments synced: ${result.comments}`);
    console.log(`Influencer (KOL) changes synced: ${result.influencer_changes}`);
    console.log(`Projects synced: ${result.projects}`);

    if (result.errors.length > 0) {
      console.log('============================================');
      console.log('Errors:');
      result.errors.forEach((err, index) => {
        console.log(`${index + 1}. ${err}`);
      });
    }

    process.exit(0);
  } catch (error) {
    console.error('Error while running the sync task:', error);
    process.exit(1);
  }
}

// Run the main function
main();

707
backend/src/services/syncService.ts
Normal file
@@ -0,0 +1,707 @@
import { Pool } from 'pg';
import supabase from '../utils/supabase';
import clickhouse from '../utils/clickhouse';
import config from '../config';
import { randomUUID } from 'crypto';

// Define types for better type safety
interface PostRecord {
  post_id: string;
  influencer_id: string;
  platform: string;
  project_id?: string;
  title?: string;
  description?: string;
  published_at: string;
  created_at: string;
  influencer_name?: string;
  followers_count?: number;
}

interface CommentRecord {
  comment_id: string;
  post_id: string;
  user_id?: string;
  content: string;
  sentiment_score?: number;
  created_at: string;
  influencer_id: string;
  platform: string;
  project_id?: string;
}

interface InfluencerRecord {
  influencer_id: string;
  name: string;
  platform: string;
  followers_count: number;
  video_count: number;
  updated_at: string;
}

interface ProjectRecord {
  id: string;
  name: string;
  description?: string;
  created_at: string;
}

interface SyncStats {
  success: boolean;
  timestamp: string;
  duration: number; // milliseconds
  posts_synced: number;
  comments_synced: number;
  influencer_changes_synced: number;
  projects_synced: number;
  errors: string[];
}

// Initialize PostgreSQL client
const pgPool = new Pool({
  connectionString: process.env.DATABASE_URL || 'postgresql://postgres:postgres@localhost:5432/promote',
});

// Batch size
const BATCH_SIZE = 100;

/**
 * Submits sync stats to ClickHouse
 * @param stats Sync stats
 */
async function recordSyncStats(stats: SyncStats): Promise<void> {
  try {
    // First check whether the table exists; create it if it does not
    await clickhouse.query({
      query: `
        CREATE TABLE IF NOT EXISTS ${config.clickhouse.database}.sync_logs (
          timestamp DateTime,
          duration_ms UInt32,
          posts_synced UInt32,
          comments_synced UInt32,
          influencer_changes_synced UInt32,
          projects_synced UInt32,
          success UInt8,
          error_messages String
        ) ENGINE = MergeTree()
        ORDER BY (timestamp)
      `
    });

    // Build the INSERT statement
    const insertQuery = `
      INSERT INTO ${config.clickhouse.database}.sync_logs
      (timestamp, duration_ms, posts_synced, comments_synced, influencer_changes_synced,
       projects_synced, success, error_messages)
      VALUES ('${stats.timestamp}', ${stats.duration}, ${stats.posts_synced},
        ${stats.comments_synced}, ${stats.influencer_changes_synced},
        ${stats.projects_synced}, ${stats.success ? 1 : 0}, '${stats.errors.join('; ').replace(/'/g, "\\'")}')`

    console.log('[DEBUG] Sync stats INSERT statement to execute:', insertQuery);

    // The actual execution is commented out
    // await clickhouse.query({
    //   query: insertQuery
    // });
  } catch (error) {
    console.error('Failed to record sync stats:', error);
  }
}

/**
 * Escapes special characters in a ClickHouse string
 */
function escapeClickHouseString(str: string): string {
  if (!str) return '';
  return str.replace(/'/g, "\\'");
}

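recordSyncStats above builds its INSERT by string interpolation and only escapes single quotes, with the actual call commented out. If the project stays on @clickhouse/client, a hedged alternative is a structured insert, which sidesteps hand-escaping; a sketch only, with the table and columns taken from the sync_logs DDL earlier in this commit:

// Sketch: structured insert through the client (assumes @clickhouse/client).
// DateTime values may need 'YYYY-MM-DD HH:MM:SS' formatting depending on server settings.
await clickhouse.insert({
  table: `${config.clickhouse.database}.sync_logs`,
  values: [{
    timestamp: stats.timestamp,
    duration_ms: stats.duration,
    posts_synced: stats.posts_synced,
    comments_synced: stats.comments_synced,
    influencer_changes_synced: stats.influencer_changes_synced,
    projects_synced: stats.projects_synced,
    success: stats.success ? 1 : 0,
    error_messages: stats.errors.join('; '),
  }],
  format: 'JSONEachRow',
});
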
/**
 * Syncs new posts from PostgreSQL to ClickHouse
 * @param lastSyncTimestamp The timestamp of the last sync
 */
export async function syncNewPosts(lastSyncTimestamp: string): Promise<number> {
  try {
    // Get new posts from PostgreSQL
    const query = `
      SELECT
        p.post_id,
        p.influencer_id,
        p.platform,
        p.project_id,
        p.title,
        p.description,
        p.published_at,
        p.created_at,
        i.name as influencer_name,
        i.followers_count
      FROM posts p
      JOIN influencers i ON p.influencer_id = i.influencer_id
      WHERE p.created_at > $1
      ORDER BY p.created_at
    `;

    const { rows: posts } = await pgPool.query<PostRecord>(query, [lastSyncTimestamp]);

    if (posts.length === 0) {
      console.log('No new posts to sync');
      return 0;
    }

    console.log(`Found ${posts.length} new posts to sync`);

    let syncedCount = 0;

    // Batch processing to avoid processing too much data at once
    for (let i = 0; i < posts.length; i += BATCH_SIZE) {
      const batch = posts.slice(i, i + BATCH_SIZE);

      try {
        // Prepare the VALUES part of the batch insert
        const values = batch.map(post => {
          const eventId = randomUUID();
          const timestamp = new Date(post.created_at).toISOString();
          const date = timestamp.split('T')[0];
          const hour = new Date(post.created_at).getHours();
          const contentType = determineContentType(post.title || '', post.description || '');
          const keywords = JSON.stringify(extractKeywords(post.title || ''));

          return `('${eventId}', '${timestamp}', '${date}', ${hour}, '', '${post.influencer_id}', '${post.post_id}', '${post.project_id || ''}', 'impression', 'exposure', '${escapeClickHouseString(post.platform)}', '${contentType}', 'approved', 'neutral', '', ${keywords}, 1.0, ${post.followers_count || 0}, 0, 0, 0, 0, '', '', '', '', '', '', '')`;
        }).join(', ');

        // Build the full INSERT query
        const insertQuery = `
          INSERT INTO ${config.clickhouse.database}.events
          (event_id, timestamp, date, hour, user_id, influencer_id, content_id, project_id,
           event_type, funnel_stage, platform, content_type, content_status, sentiment,
           comment_text, keywords, interaction_value, followers_count, followers_change,
           likes_count, likes_change, views_count, ip, user_agent, device_type, referrer,
           geo_country, geo_city, session_id)
          VALUES ${values}`;

        console.log(`[DEBUG] Batch ${i / BATCH_SIZE + 1} post INSERT statement (first 500 chars): ${insertQuery.substring(0, 500)}...`);

        // Inspect the first row of values
        if (batch.length > 0) {
          console.log(`[DEBUG] First post row values: ${values.split('),')[0]})`);
        }

        // The actual execution is commented out
        // await clickhouse.query({
        //   query: insertQuery
        // });

        syncedCount += batch.length;
        console.log(`[DEBUG] Simulated syncing a batch of ${batch.length} posts (${syncedCount}/${posts.length})`);
      } catch (error) {
        console.error(`Error syncing post batch ${i / BATCH_SIZE + 1}:`, error);
      }
    }

    console.log(`[DEBUG] Simulated syncing ${syncedCount} posts to ClickHouse`);
    return syncedCount;
  } catch (error) {
    console.error('Error syncing new posts:', error);
    throw error;
  }
}

/**
 * Syncs new comments from PostgreSQL to ClickHouse
 * @param lastSyncTimestamp The timestamp of the last sync
 */
export async function syncComments(lastSyncTimestamp: string): Promise<number> {
  try {
    // Get new comments from PostgreSQL
    const query = `
      SELECT
        c.comment_id,
        c.post_id,
        c.user_id,
        c.content,
        c.sentiment_score,
        c.created_at,
        p.influencer_id,
        p.platform,
        p.project_id
      FROM comments c
      JOIN posts p ON c.post_id = p.post_id
      WHERE c.created_at > $1
      ORDER BY c.created_at
    `;

    const { rows: comments } = await pgPool.query<CommentRecord>(query, [lastSyncTimestamp]);

    if (comments.length === 0) {
      console.log('No new comments to sync');
      return 0;
    }

    console.log(`Found ${comments.length} new comments to sync`);

    let syncedCount = 0;

    // Batch processing to avoid processing too much data at once
    for (let i = 0; i < comments.length; i += BATCH_SIZE) {
      const batch = comments.slice(i, i + BATCH_SIZE);

      try {
        // Prepare the VALUES part of the batch insert
        const values = batch.map(comment => {
          const eventId = randomUUID();
          const timestamp = new Date(comment.created_at).toISOString();
          const date = timestamp.split('T')[0];
          const hour = new Date(comment.created_at).getHours();
          const sentiment = determineSentiment(comment.sentiment_score || 0);
          const keywords = JSON.stringify(extractKeywords(comment.content));
          const escapedComment = escapeClickHouseString(comment.content);

          return `('${eventId}', '${timestamp}', '${date}', ${hour}, '${comment.user_id || ''}', '${comment.influencer_id}', '${comment.post_id}', '${comment.project_id || ''}', 'comment', 'consideration', '${escapeClickHouseString(comment.platform)}', 'text', 'approved', '${sentiment}', '${escapedComment}', ${keywords}, 3.0, 0, 0, 0, 0, 0, '', '', '', '', '', '', '')`;
        }).join(', ');

        // Build the full INSERT query
        const insertQuery = `
          INSERT INTO ${config.clickhouse.database}.events
          (event_id, timestamp, date, hour, user_id, influencer_id, content_id, project_id,
           event_type, funnel_stage, platform, content_type, content_status, sentiment,
           comment_text, keywords, interaction_value, followers_count, followers_change,
           likes_count, likes_change, views_count, ip, user_agent, device_type, referrer,
           geo_country, geo_city, session_id)
          VALUES ${values}`;

        console.log(`[DEBUG] Batch ${i / BATCH_SIZE + 1} comment INSERT statement (first 500 chars): ${insertQuery.substring(0, 500)}...`);

        // Inspect the first row of values
        if (batch.length > 0) {
          console.log(`[DEBUG] First comment row values: ${values.split('),')[0]})`);
        }

        // The actual execution is commented out
        // await clickhouse.query({
        //   query: insertQuery
        // });

        syncedCount += batch.length;
        console.log(`[DEBUG] Simulated syncing a batch of ${batch.length} comments (${syncedCount}/${comments.length})`);
      } catch (error) {
        console.error(`Error syncing comment batch ${i / BATCH_SIZE + 1}:`, error);
      }
    }

    console.log(`[DEBUG] Simulated syncing ${syncedCount} comments to ClickHouse`);
    return syncedCount;
  } catch (error) {
    console.error('Error syncing new comments:', error);
    throw error;
  }
}

/**
 * Syncs project information from PostgreSQL to ClickHouse
 * @param lastSyncTimestamp The timestamp of the last sync
 */
export async function syncProjects(lastSyncTimestamp: string): Promise<number> {
  try {
    // Get new projects and updated projects from PostgreSQL
    const query = `
      SELECT
        id,
        name,
        description,
        created_at
      FROM projects
      WHERE created_at > $1 OR updated_at > $1
      ORDER BY created_at
    `;

    const { rows: projects } = await pgPool.query<ProjectRecord>(query, [lastSyncTimestamp]);

    if (projects.length === 0) {
      console.log('No new projects to sync');
      return 0;
    }

    console.log(`Found ${projects.length} projects to sync`);

    let syncedCount = 0;

    // Batch processing
    for (let i = 0; i < projects.length; i += BATCH_SIZE) {
      const batch = projects.slice(i, i + BATCH_SIZE);

      try {
        // Prepare the VALUES part of the batch insert
        const values = batch.map(project => {
          const eventId = randomUUID();
          const timestamp = new Date(project.created_at).toISOString();
          const date = timestamp.split('T')[0];
          const hour = new Date(project.created_at).getHours();
          const keywords = JSON.stringify(extractKeywords(project.name + ' ' + (project.description || '')));
          const escapedDesc = escapeClickHouseString(project.description || '');

          return `('${eventId}', '${timestamp}', '${date}', ${hour}, '', '', '', '${project.id}', 'project_update', 'interest', 'internal', 'text', 'approved', 'neutral', '${escapedDesc}', ${keywords}, 5.0, 0, 0, 0, 0, 0, '', '', '', '', '', '', '')`;
        }).join(', ');

        // Build the full INSERT query
        const insertQuery = `
          INSERT INTO ${config.clickhouse.database}.events
          (event_id, timestamp, date, hour, user_id, influencer_id, content_id, project_id,
           event_type, funnel_stage, platform, content_type, content_status, sentiment,
           comment_text, keywords, interaction_value, followers_count, followers_change,
           likes_count, likes_change, views_count, ip, user_agent, device_type, referrer,
           geo_country, geo_city, session_id)
          VALUES ${values}`;

        console.log(`[DEBUG] Batch ${i / BATCH_SIZE + 1} project INSERT statement (first 500 chars): ${insertQuery.substring(0, 500)}...`);

        // Inspect the first row of values
        if (batch.length > 0) {
          console.log(`[DEBUG] First project row values: ${values.split('),')[0]})`);
        }

        // The actual execution is commented out
        // await clickhouse.query({
        //   query: insertQuery
        // });

        syncedCount += batch.length;
        console.log(`[DEBUG] Simulated syncing a batch of ${batch.length} projects (${syncedCount}/${projects.length})`);
      } catch (error) {
        console.error(`Error syncing project batch ${i / BATCH_SIZE + 1}:`, error);
      }
    }

    console.log(`[DEBUG] Simulated syncing ${syncedCount} projects to ClickHouse`);
    return syncedCount;
  } catch (error) {
    console.error('Error syncing projects:', error);
    throw error;
  }
}

/**
 * Syncs influencer metric changes from PostgreSQL to ClickHouse
 * @param lastSyncTimestamp The timestamp of the last sync
 */
export async function syncInfluencerChanges(lastSyncTimestamp: string): Promise<number> {
  try {
    // Get influencers with updated metrics
    const query = `
      SELECT
        i.influencer_id,
        i.name,
        i.platform,
        i.followers_count,
        i.video_count,
        i.updated_at
      FROM influencers i
      WHERE i.updated_at > $1
      ORDER BY i.updated_at
    `;

    const { rows: influencers } = await pgPool.query<InfluencerRecord>(query, [lastSyncTimestamp]);

    if (influencers.length === 0) {
      console.log('No influencer changes to sync');
      return 0;
    }

    console.log(`Found ${influencers.length} influencer changes to sync`);

    let syncedCount = 0;
    let batchEvents: string[] = [];

    // Fetch the latest record for each relevant influencer from ClickHouse
    if (influencers.length > 0) {
      try {
        const influencerIds = influencers.map(i => `'${i.influencer_id}'`).join(',');
        const result = await clickhouse.query({
          query: `
            SELECT
              influencer_id AS id,
              followers_count,
              max(timestamp) AS last_update
            FROM ${config.clickhouse.database}.events
            WHERE influencer_id IN (${influencerIds})
              AND event_type IN ('follow', 'unfollow', 'impression')
            GROUP BY influencer_id, followers_count
            ORDER BY last_update DESC
          `,
          format: 'JSONEachRow'
        });

        // Convert the result into a map for quick lookups
        const prevMetricsMap = new Map<string, { id: string; followers_count: number; last_update: string }>();

        // Read the data out of the result
        try {
          // Try to parse the result
          if ('rows' in result) {
            // If the result exposes a rows property, use it directly
            for (const record of result.rows as any[]) {
              if (!prevMetricsMap.has(record.id) ||
                  new Date(record.last_update) > new Date(prevMetricsMap.get(record.id)!.last_update)) {
                prevMetricsMap.set(record.id, record);
              }
            }
          } else {
            // Otherwise, try converting the result to JSON
            // Process the result without relying on the text() method
            const rows: any[] = [];
            try {
              // Check whether an alternative method is available
              if (typeof result.json === 'function') {
                const jsonData = await result.json();
                if (Array.isArray(jsonData)) {
                  rows.push(...jsonData);
                }
              } else {
                // Assume the result is a ResultSet or a similar structure
                console.log('Warning: Using fallback method to process query results');
                // Cannot process the result directly; continue with an empty array
              }
            } catch (parseError) {
              console.error('Error parsing ClickHouse result:', parseError);
            }

            for (const record of rows) {
              const typedRecord = record as { id: string; followers_count: number; last_update: string };
              if (!prevMetricsMap.has(typedRecord.id) ||
                  new Date(typedRecord.last_update) > new Date(prevMetricsMap.get(typedRecord.id)!.last_update)) {
                prevMetricsMap.set(typedRecord.id, typedRecord);
              }
            }
          }
        } catch (e) {
          console.error('Error processing ClickHouse result:', e);
        }

        // Process each influencer's changes
        for (const influencer of influencers) {
          try {
            // Get the previous metrics
            const prevMetrics = prevMetricsMap.get(influencer.influencer_id);
            const prevFollowersCount = prevMetrics ? Number(prevMetrics.followers_count) || 0 : 0;

            // Compute the follower change
            const followersChange = influencer.followers_count - prevFollowersCount;

            // Only create an event when there is an actual change
            if (followersChange !== 0) {
              const eventId = randomUUID();
              const timestamp = new Date(influencer.updated_at).toISOString();
              const date = timestamp.split('T')[0];
              const hour = new Date(influencer.updated_at).getHours();
              const eventType = followersChange > 0 ? 'follow' : 'unfollow';

              batchEvents.push(`('${eventId}', '${timestamp}', '${date}', ${hour}, '', '${influencer.influencer_id}', '', '', '${eventType}', 'interest', '${escapeClickHouseString(influencer.platform)}', 'text', 'approved', 'neutral', '', '[]', 2.0, ${influencer.followers_count}, ${followersChange}, 0, 0, 0, '', '', '', '', '', '', '')`);

              syncedCount++;
            }
          } catch (error) {
            console.error(`Error processing influencer ${influencer.influencer_id}:`, error);
            // Continue with the next influencer
          }
        }
      } catch (error) {
        console.error('Error querying previous metrics:', error);
      }
    }

    // If there are events to insert, insert them in one batch
    if (batchEvents.length > 0) {
      try {
        // Build the full INSERT query
        const insertQuery = `
          INSERT INTO ${config.clickhouse.database}.events
          (event_id, timestamp, date, hour, user_id, influencer_id, content_id, project_id,
           event_type, funnel_stage, platform, content_type, content_status, sentiment,
           comment_text, keywords, interaction_value, followers_count, followers_change,
           likes_count, likes_change, views_count, ip, user_agent, device_type, referrer,
           geo_country, geo_city, session_id)
          VALUES ${batchEvents.join(', ')}`;

        console.log(`[DEBUG] Influencer (KOL) change INSERT statement (first 500 chars): ${insertQuery.substring(0, 500)}...`);

        // Inspect the first row of values
        if (batchEvents.length > 0) {
          console.log(`[DEBUG] First influencer change row values: ${batchEvents[0]}`);
        }

        // The actual execution is commented out
        // await clickhouse.query({
        //   query: insertQuery
        // });

        console.log(`[DEBUG] Simulated syncing ${batchEvents.length} influencer (KOL) changes`);
      } catch (error) {
        console.error(`Error syncing influencer batch:`, error);
        syncedCount = 0; // Reset the sync count on failure
      }
    } else {
      console.log('No follower changes detected, skipping influencer sync');
    }

    console.log(`[DEBUG] Simulated syncing ${syncedCount} influencer (KOL) changes to ClickHouse`);
    return syncedCount;
  } catch (error) {
    console.error('Error syncing influencer changes:', error);
    throw error;
  }
}

/**
 * Syncs all data from PostgreSQL to ClickHouse
 * @param lastSyncTimestamp The timestamp of the last sync
 */
export async function syncAllData(lastSyncTimestamp: string): Promise<{
  posts: number;
  comments: number;
  influencer_changes: number;
  projects: number;
  success: boolean;
  errors: string[];
  duration: number;
}> {
  const startTime = Date.now();
  const errors: string[] = [];
  let postsCount = 0;
  let commentsCount = 0;
  let influencerChangesCount = 0;
  let projectsCount = 0;
  let success = true;

  try {
    // Sync new posts
    try {
      postsCount = await syncNewPosts(lastSyncTimestamp);
    } catch (error: unknown) {
      const errorMessage = error instanceof Error ? error.message : 'Unknown error';
      errors.push(`Posts sync error: ${errorMessage}`);
      success = false;
    }

    // Sync new comments
    try {
      commentsCount = await syncComments(lastSyncTimestamp);
    } catch (error: unknown) {
      const errorMessage = error instanceof Error ? error.message : 'Unknown error';
      errors.push(`Comments sync error: ${errorMessage}`);
      success = false;
    }

    // Sync influencer changes
    try {
      influencerChangesCount = await syncInfluencerChanges(lastSyncTimestamp);
    } catch (error: unknown) {
      const errorMessage = error instanceof Error ? error.message : 'Unknown error';
      errors.push(`Influencer changes sync error: ${errorMessage}`);
      success = false;
    }

    // Sync projects
    try {
      projectsCount = await syncProjects(lastSyncTimestamp);
    } catch (error: unknown) {
      const errorMessage = error instanceof Error ? error.message : 'Unknown error';
      errors.push(`Projects sync error: ${errorMessage}`);
      success = false;
    }

    // Record sync stats
    const endTime = Date.now();
    const duration = endTime - startTime;
    const syncStats: SyncStats = {
      success,
      timestamp: new Date().toISOString(),
      duration,
      posts_synced: postsCount,
      comments_synced: commentsCount,
      influencer_changes_synced: influencerChangesCount,
      projects_synced: projectsCount,
      errors
    };

    await recordSyncStats(syncStats);

    return {
      posts: postsCount,
      comments: commentsCount,
      influencer_changes: influencerChangesCount,
      projects: projectsCount,
      success,
      errors,
      duration
    };
  } catch (error: unknown) {
    console.error('Error in syncAllData:', error);
    const errorMessage = error instanceof Error ? error.message : 'Unknown error';
    return {
      posts: postsCount,
      comments: commentsCount,
      influencer_changes: influencerChangesCount,
      projects: projectsCount,
      success: false,
      errors: [...errors, `General sync error: ${errorMessage}`],
      duration: Date.now() - startTime
    };
  }
}

/**
 * Helper function to determine content type based on title/description
 */
function determineContentType(title: string, description: string = ''): string {
  const text = (title + ' ' + description).toLowerCase();

  if (text.includes('video') || text.includes('watch') || text.includes('视频')) return 'video';
  if (text.includes('image') || text.includes('photo') || text.includes('pic') || text.includes('图片')) return 'image';
  if (text.includes('story') || text.includes('故事')) return 'story';
  if (text.includes('reel') || text.includes('短视频')) return 'reel';
  if (text.includes('live') || text.includes('直播')) return 'live';

  // Default
  return 'text';
}

/**
 * Helper function to determine sentiment from score
 */
function determineSentiment(score: number): string {
  if (!score && score !== 0) return 'neutral';

  if (score > 0.3) return 'positive';
  if (score < -0.3) return 'negative';
  return 'neutral';
}

/**
 * Helper function to extract keywords from text
 */
function extractKeywords(text: string): string[] {
  if (!text) return [];

  // Convert to lowercase
  const lower = text.toLowerCase();

  // Remove special characters and split into words
  const words = lower.replace(/[^\w\s]/g, ' ').split(/\s+/);

  // Filter out common words (simple stop words list)
  const stopWords = new Set([
    'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'with',
    'about', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
    'had', 'do', 'does', 'did', 'i', 'you', 'he', 'she', 'it', 'we', 'they',
    'this', 'that', 'these', 'those', 'of', 'by', 'from', 'as', 'if', 'then',
    'than', 'so', 'what', 'when', 'where', 'how', 'all', 'any', 'both', 'each',
    '我', '你', '他', '她', '它', '们', '的', '和', '是', '在', '了', '有', '就',
    '都', '而', '及', '与', '这', '那', '不', '但', '如', '要', '可以', '会'
  ]);

  const keywords = words
    .filter(word => word.length > 2) // Filter out short words
    .filter(word => !stopWords.has(word)) // Filter out stop words
    .slice(0, 10); // Limit to 10 keywords

  return [...new Set(keywords)]; // Remove duplicates
}

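A quick spot-check of the three helpers above with hypothetical inputs (they are module-private, so they would need to be exported from syncService.ts to exercise them outside this file):

console.log(determineContentType('New video: spring unboxing'));  // 'video'
console.log(determineSentiment(0.52));                            // 'positive'
console.log(determineSentiment(-0.8));                            // 'negative'
console.log(extractKeywords('The new unboxing video is great'));  // ['new', 'unboxing', 'video', 'great']
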
@@ -27,60 +27,5 @@ const createClickHouseClient = () => {

const clickhouse = createClickHouseClient();

// Initialize ClickHouse database and tables
export const initClickHouse = async () => {
  try {
    // Create database if not exists
    await clickhouse.query({
      query: `CREATE DATABASE IF NOT EXISTS ${config.clickhouse.database}`,
    });

    // Create tables for tracking events
    await clickhouse.query({
      query: `
        CREATE TABLE IF NOT EXISTS ${config.clickhouse.database}.view_events (
          user_id String,
          content_id String,
          timestamp DateTime DEFAULT now(),
          ip String,
          user_agent String
        ) ENGINE = MergeTree()
        PARTITION BY toYYYYMM(timestamp)
        ORDER BY (user_id, content_id, timestamp)
      `,
    });

    await clickhouse.query({
      query: `
        CREATE TABLE IF NOT EXISTS ${config.clickhouse.database}.like_events (
          user_id String,
          content_id String,
          timestamp DateTime DEFAULT now(),
          action Enum('like' = 1, 'unlike' = 2)
        ) ENGINE = MergeTree()
        PARTITION BY toYYYYMM(timestamp)
        ORDER BY (user_id, content_id, timestamp)
      `,
    });

    await clickhouse.query({
      query: `
        CREATE TABLE IF NOT EXISTS ${config.clickhouse.database}.follower_events (
          follower_id String,
          followed_id String,
          timestamp DateTime DEFAULT now(),
          action Enum('follow' = 1, 'unfollow' = 2)
        ) ENGINE = MergeTree()
        PARTITION BY toYYYYMM(timestamp)
        ORDER BY (follower_id, followed_id, timestamp)
      `,
    });

    console.log('ClickHouse database and tables initialized');
  } catch (error) {
    console.error('Error initializing ClickHouse:', error);
    console.log('Continuing with limited functionality...');
  }
};

export default clickhouse;

@@ -532,29 +532,3 @@ export const checkDatabaseConnection = async () => {
    return false;
  }
};

/**
 * Initializes the database - this function now only serves as an entry point for manual initialization.
 * The actual initialization runs only when it is explicitly invoked through the admin API.
 */
export const initDatabase = async () => {
  try {
    console.log('Starting database initialization...');
    console.log('Warning: this operation will modify the database schema; make sure you know what you are doing');

    // Initialize Supabase functions
    await initSupabaseFunctions();

    // Initialize Supabase tables
    await initSupabaseTables();

    // Initialize ClickHouse tables
    await initClickHouseTables();

    console.log('Database initialization completed');
    return true;
  } catch (error) {
    console.error('Database initialization failed:', error);
    return false;
  }
};

@@ -2,12 +2,14 @@ import { Queue, Worker } from 'bullmq';
import supabase from './supabase';
import clickhouse from './clickhouse';
import { getRedisClient } from './redis';
import { syncAllData } from '../services/syncService';

interface ScheduledCollectionData {
  type: 'influencer_metrics' | 'post_metrics';
  type: 'influencer_metrics' | 'post_metrics' | 'data_sync';
  influencer_id?: string;
  post_id?: string;
  project_id?: string;
  last_sync_timestamp?: string;
}

// Create a mock scheduler if BullMQ doesn't export QueueScheduler

@@ -53,13 +55,15 @@ export const initScheduledTaskWorkers = () => {
    'scheduled-data-collection',
    async (job) => {
      console.log(`Processing scheduled task: ${job.id}`, job.data);
      const { type, influencer_id, post_id, project_id } = job.data as ScheduledCollectionData;
      const { type, influencer_id, post_id, project_id, last_sync_timestamp } = job.data as ScheduledCollectionData;

      try {
        if (type === 'influencer_metrics') {
          await collectInfluencerMetrics(influencer_id);
        } else if (type === 'post_metrics') {
          await collectPostMetrics(post_id);
        } else if (type === 'data_sync') {
          await syncPostgresToClickhouse(last_sync_timestamp);
        }

        console.log(`Successfully completed scheduled task: ${job.id}`);

@@ -404,3 +408,80 @@ async function collectPostMetrics(postId?: string) {
    changes
  };
}

/**
 * Schedules a task to sync data from PostgreSQL to ClickHouse
 * @param cronExpression The cron expression for scheduling (default: every 15 minutes)
 */
export const scheduleDatabaseSync = async (
  cronExpression: string = '*/15 * * * *' // Default: Every 15 minutes
) => {
  const queue = await createScheduledTaskQueue();

  // Get current timestamp as the initial last sync time
  const currentTimestamp = new Date().toISOString();

  const jobName = 'postgres-to-clickhouse-sync';

  // Remove existing job if any
  const repeatableJobs = await queue.getRepeatableJobs();
  const existingJob = repeatableJobs.find(job => job.name === jobName);

  if (existingJob) {
    await queue.removeRepeatableByKey(existingJob.key);
    console.log(`Removed existing sync job: ${jobName}`);
  }

  // Add new repeatable job
  const job = await queue.add(
    jobName,
    {
      type: 'data_sync',
      last_sync_timestamp: currentTimestamp,
    },
    {
      repeat: {
        pattern: cronExpression,
      },
      removeOnComplete: {
        age: 24 * 3600, // Keep completed jobs for 24 hours
        count: 100, // Keep at most 100 jobs
      },
      removeOnFail: false, // Do not remove failed jobs to track failures
    }
  );

  console.log(`Scheduled PostgreSQL to ClickHouse sync job: ${job.id}, pattern: ${cronExpression}`);
  return job;
};

/**
 * Syncs data from PostgreSQL to ClickHouse
 * @param lastSyncTimestamp The timestamp of the last sync
 */
async function syncPostgresToClickhouse(lastSyncTimestamp?: string): Promise<{
  posts: number;
  comments: number;
  influencer_changes: number;
}> {
  console.log(`Starting PostgreSQL to ClickHouse sync from timestamp: ${lastSyncTimestamp}`);

  // If no last sync timestamp provided, use a timestamp from 1 hour ago
  if (!lastSyncTimestamp) {
    const oneHourAgo = new Date();
    oneHourAgo.setHours(oneHourAgo.getHours() - 1);
    lastSyncTimestamp = oneHourAgo.toISOString();
  }

  try {
    // Sync all data
    const result = await syncAllData(lastSyncTimestamp);

    console.log('PostgreSQL to ClickHouse sync completed:', result);

    return result;
  } catch (error) {
    console.error('Error in PostgreSQL to ClickHouse sync:', error);
    throw error;
  }
}

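scheduleDatabaseSync defaults to the */15 * * * * pattern; the interval can be overridden at the call site in src/index.ts. A minimal sketch, using an hourly pattern purely as an example:

// Inside startServer(), instead of the default 15-minute schedule:
await scheduleDatabaseSync('0 * * * *'); // every hour, on the hour
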